created transform method

2026-07-27 23:35:01 +00:00 · 2024-08-09 11:07:16 +01:00 · 2024-08-09 11:07:16 +01:00 · 73be979c29
commit 73be979c29
parent 3002a2c740
5 changed files with 113 additions and 3 deletions
--- a/etl/bill_savings/EnergyConsumptionModel.py
+++ b/etl/bill_savings/EnergyConsumptionModel.py
@ -586,7 +586,7 @@ class EnergyConsumptionModel:

    def estimate_new_consumption(self, current_energy_efficiency, target_efficiency, current_consumption):
        """
-        Given then consumption_averages dataset, which is produced as a result of the data_combining.py script,
+        Given then consumption_averages dataset, which is produced as a result of the training_data.py script,
        for the energy kwh models, this function will estimate the new consumption based on the current consumption,
        based on the expected reduction in consumption from the current rating to the target rating.
        :param current_energy_efficiency:
--- a/etl/bill_savings/KwhData.py
+++ b/etl/bill_savings/KwhData.py
@ -1,5 +1,6 @@
 import re
 import pandas as pd
+import numpy as np
 from datetime import datetime
 from tqdm import tqdm
 from utils.logger import setup_logger
@ -11,6 +12,23 @@ logger = setup_logger()
 class KwhData:
    COLS_TO_STRINGIFY = ["main-heating-controls", "floor-level"]

+    CATEGORICAL_COLUMNS = [
+        "lodgement-year", "lodgement-month", "main-fuel", "mainheat-description", "number-heated-rooms",
+        "number-habitable-rooms", "mainheat-energy-eff", "mainheatcont-description", "property-type",
+        "built-form",
+        "construction-age-band", "secondheat-description", "hotwater-description", "hot-water-energy-eff",
+        "walls-description", "walls-energy-eff", "roof-description", "roof-energy-eff", "floor-description",
+        "county",
+        "windows-description", "windows-energy-eff", "flat-top-storey",
+        "flat-storey-count", "unheated-corridor-length", "solar-water-heating-flag", "mechanical-ventilation",
+        "low-energy-lighting", "environment-impact-current", "energy-tariff", "current-energy-rating"
+    ]
+
+    NUMERICAL_COLUMNS = [
+        'heating-cost-current', 'total-floor-area', 'co2-emissions-current', 'energy-consumption-current',
+        'heating-cost-potential', 'hot-water-cost-current', 'current-energy-efficiency'
+    ]
+
    def __init__(self, bucket):
        self.run_date = datetime.now().strftime("%Y-%m-%d")
        self.bucket = bucket
@ -18,6 +36,7 @@ class KwhData:

        self.consumption_data_filepath = None
        self.consumption_averages_filepath = None
+        self.model_training_data_filepath = None

    @staticmethod
    def extract_kwh_value(text: str):
@ -116,3 +135,84 @@ class KwhData:
        )

        self.data = df
+
+    def transform(
+        self, data: pd.DataFrame, cleaned, new=False, save=False
+    ):
+        """
+        Given the input EPCs, this method will transform the data into a format that can be used by the model
+        This method can be used to transform the training data, or new epcs within the backend engine
+        :return:
+        """
+
+        # TODO: New is a temporary parameter, which will transform the epc descriptions to their transformed features
+        #       in anticipation of the new model
+
+        data["lodgement-date"] = pd.to_datetime(data["lodgement-date"])
+        data["lodgement-year"] = data["lodgement-date"].dt.year
+        data["lodgement-month"] = data["lodgement-date"].dt.month
+
+        # For walls, roof, floor description where we have average thermal transmittance, to avoid too many
+        # categories
+        # we group them
+        ranges = {
+            "lessthan 0.1": (0, 0.1),
+            "0.1 - 0.3": (0.1, 0.3),
+            "0.3 - 0.5": (0.3, 0.5),
+            "morethan 0.5": (0.5, 2.5),
+        }
+
+        # Generate the lookup table
+        thermal_transmittance_lookup_table = []
+        for i in range(1, 251):
+            value = i / 100
+            for label, (low, high) in ranges.items():
+                if low < value <= high:
+                    thermal_transmittance_lookup_table.append({"from": value, "to": label})
+                    break
+
+        # Convert to DataFrame for display
+        thermal_transmittance_lookup_table = pd.DataFrame(thermal_transmittance_lookup_table)
+        thermal_transmittance_lookup_table["from"] = thermal_transmittance_lookup_table["from"].astype(str)
+
+        # Apply the lookup table to the data
+        for feature in ["walls-description", "roof-description", "floor-description"]:
+            cleaned_df = pd.DataFrame(cleaned[feature])[["original_description", "thermal_transmittance"]]
+            # Round to 2 decimal places and convert to string
+            cleaned_df["thermal_transmittance"] = cleaned_df["thermal_transmittance"].round(2).astype(str)
+
+            data = data.merge(
+                cleaned_df,
+                how="left",
+                left_on=feature,
+                right_on="original_description",
+            )
+            # We now have the thermal transmittance in the data, which we can use to group with the lookup table
+            data = data.merge(
+                thermal_transmittance_lookup_table,
+                how="left",
+                left_on="thermal_transmittance",
+                right_on="from",
+            )
+            # Where "to" is populated, replace feature with to
+            data[feature] = np.where(
+                ~pd.isnull(data["to"]),
+                data["to"],
+                data[feature]
+            )
+            data = data.drop(columns=["original_description", "thermal_transmittance", "from", "to"])
+
+        data[self.NUMERICAL_COLUMNS] = data[self.NUMERICAL_COLUMNS].apply(pd.to_numeric)
+        data[self.CATEGORICAL_COLUMNS] = data[self.CATEGORICAL_COLUMNS].astype(str)
+
+        # Create new features:
+        data['estimate_annual_kwh'] = data['energy-consumption-current'] * data['total-floor-area']
+
+        if save:
+            self.model_training_data_filepath = f"energy_consumption/{self.run_date}/training_data.parquet"
+            logger.info(f"Storing energy consumption dataset in s3 at {self.consumption_data_filepath}")
+            save_dataframe_to_s3_parquet(
+                bucket_name=self.bucket,
+                file_key=self.model_training_data_filepath,
+                df=data
+            )
--- a/etl/bill_savings/data_collection.py
+++ b/etl/bill_savings/data_collection.py
@ -7,7 +7,7 @@ import inspect
 import pandas as pd
 from tqdm import tqdm
 from bs4 import BeautifulSoup
-from etl.epc.settings import EARLIEST_EPC_DATE
+from training_data.epc.settings import EARLIEST_EPC_DATE
 from pathlib import Path
 import numpy as np
 from utils.s3 import save_pickle_to_s3
--- a/etl/bill_savings/training.py
+++ b/etl/bill_savings/training.py
@ -1,7 +1,7 @@
 from pprint import pprint
 import msgpack
 from utils.s3 import read_from_s3
-from etl.bill_savings.EnergyConsumptionModel import EnergyConsumptionModel
+from training_data.bill_savings.EnergyConsumptionModel import EnergyConsumptionModel


 def handler():
--- a/etl/bill_savings/data_combining.py
+++ b/etl/bill_savings/data_combining.py
@ -1,4 +1,6 @@
+import msgpack
 from etl.bill_savings.KwhData import KwhData
+from utils.s3 import read_from_s3


 def app():
@ -8,5 +10,13 @@ def app():
    :return:
    """

+    cleaned = read_from_s3(
+        s3_file_name="cleaned_epc_data/cleaned.bson",
+        bucket_name="retrofit-data-dev"
+    )
+
+    cleaned = msgpack.unpackb(cleaned, raw=False)
+
    kwh_data_client = KwhData(bucket="retrofit-datalake-dev")
    kwh_data_client.combine()
+    kwh_data_client.transform(data=kwh_data_client.data, cleaned=cleaned, save=True)