# Model/etl/bill_savings/KwhData.py

import re
import pandas as pd
import numpy as np
from datetime import datetime
from tqdm import tqdm
from utils.logger import setup_logger
from utils.s3 import (
    list_files_in_s3_folder, read_pickle_from_s3, save_dataframe_to_s3_parquet, read_dataframe_from_s3_parquet,
    read_csv_from_s3
)
from backend.Property import Property

logger = setup_logger()


class KwhData:
    """
    Handles the energy consumption (kWh) data used for bill savings.

    The class can combine the collected kWh datasets into a single parquet dataset (combine), transform EPC data
    into model-ready features (transform), prepare backend Property objects for scoring (prepare_epc) and rebase
    EPC energy costs onto the latest tariff (convert_cost_to_today).
    """
    # Columns cast to str in combine() before the combined dataset is written to parquet
    COLS_TO_STRINGIFY = ["main-heating-controls", "floor-level"]

    # Columns cast to str in transform()
    CATEGORICAL_COLUMNS = [
        "lodgement-year", "lodgement-month", "main-fuel", "mainheat-description", "number-heated-rooms",
        "number-habitable-rooms", "mainheat-energy-eff", "mainheatcont-description", "property-type",
        "built-form",
        "construction-age-band", "secondheat-description", "hotwater-description", "hot-water-energy-eff",
        "walls-description", "walls-energy-eff", "roof-description", "roof-energy-eff", "floor-description",
        "county",
        "windows-description", "windows-energy-eff", "flat-top-storey",
        "flat-storey-count", "unheated-corridor-length", "solar-water-heating-flag", "mechanical-ventilation",
        "low-energy-lighting", "environment-impact-current", "energy-tariff", "current-energy-rating",
        "floor-level"
    ]

    # Columns converted with pd.to_numeric in transform()
    NUMERICAL_COLUMNS = [
        'heating-cost-current', 'total-floor-area', 'co2-emissions-current', 'energy-consumption-current',
        'heating-cost-potential', 'hot-water-cost-current', 'current-energy-efficiency'
    ]

    def __init__(self, bucket=None, read_consumption_data=False):
        self.run_date = datetime.now().strftime("%Y-%m-%d")
        self.bucket = bucket
        self.data = None

        self.consumption_data_filepath = None
        self.consumption_averages_filepath = None
        self.model_training_data_filepath = None

        self.consumption_averages = None
        self.retail_price_comparison = None
        if read_consumption_data:
            self.get_consumption_data()
            self.read_retail_price_comparison()

    def get_consumption_data(self):

        # Look for the latest version of this file
        s3_contents = list_files_in_s3_folder(bucket_name=self.bucket, folder_name="energy_consumption/")
        consumption_averages = [
            {"run_date": pd.to_datetime(x.split("/")[1]), "filepath": x}
            for x in s3_contents if "consumption_averages.parquet" in x
        ]
        # Get the file with the soonest run date
        consumption_averages = sorted(consumption_averages, key=lambda x: x["run_date"])
        if not consumption_averages:
            raise ValueError("No consumption averages data found, something went wrong")

        self.consumption_averages = read_dataframe_from_s3_parquet(
            bucket_name=self.bucket,
            file_key=consumption_averages[-1]["filepath"]
        )

    def read_retail_price_comparison(self):
        data = read_csv_from_s3(
            bucket_name=self.bucket,
            filepath="energy_consumption/retail-price-comparison.csv"
        )
        header = ['Date', 'Average standard variable tariff (Large legacy suppliers)',
                  'Average standard variable tariff (Other suppliers)', 'Average fixed tariff',
                  'Cheapest tariff (Large legacy suppliers)', 'Cheapest tariff (All suppliers)',
                  'Cheapest tariff (Basket)', 'Default tariff cap level']

        # Extract data rows
        data_rows = []
        for row in data[1:]:
            date = row['\ufeff"']
            values = row[None]
            data_rows.append([date] + values)

        self.retail_price_comparison = pd.DataFrame(data_rows, columns=header)
        self.retail_price_comparison['Date'] = pd.to_datetime(self.retail_price_comparison['Date'], errors='coerce')

    @staticmethod
    def extract_kwh_value(text: str):
        """
        Extract the numerical kWh value from a given string.

        :param text: The input string containing the kWh value.
        :return: The extracted numerical kWh value as an integer.
        """
        # Use regular expression to find the numerical value followed by "kWh per year"
        match = re.search(r'([\d,]+) kWh per year', text)

        if match:
            # Remove commas from the extracted value and convert to integer
            kwh_value = int(match.group(1).replace(',', ''))
            return kwh_value
        else:
            # If no match is found, return None or raise an exception
            return None

    def combine(self):
        """
        Given the data that is collected containing the kwh values for heating and hot water, this method will combine
        and save the data
        :return:
        """

        # Firstly, list all of the saved files in s3
        data_files = list_files_in_s3_folder(bucket_name="retrofit-datalake-dev", folder_name="energy_consumption_data")

        complete_data = []
        for files in tqdm(data_files):
            dataset_run_date = files.split("/")[-1].split(".")[0]
            # Extract the date from the file name
            dataset_run_date = pd.Timestamp(dataset_run_date)

            # Load the data from the file
            data = read_pickle_from_s3(bucket_name="retrofit-datalake-dev", s3_file_name=files)

            # We check that the retrieved energy consumption sufficiently matches the EPC data
            internal_dataset = []
            for x in data:
                epc_data = x["epc"]
                epc_sap = epc_data["current-energy-efficiency"]
                epc_potential_sap = epc_data["potential-energy-efficiency"]
                # Make sure this matches the extracted sap
                if int(epc_sap) != int(x["current_epc_efficiency"]) or int(epc_potential_sap) != int(
                    x["potential_epc_efficiency"]
                ):
                    continue

                heating_kwh = self.extract_kwh_value(x["heating_text"])
                hot_water_kwh = self.extract_kwh_value(x["hot_water_text"])
                internal_dataset.append(
                    {
                        **epc_data,
                        "heating_kwh": heating_kwh,
                        "hot_water_kwh": hot_water_kwh,
                        "dataset_run_date": dataset_run_date
                    }
                )

            complete_data.extend(internal_dataset)

        df = pd.DataFrame(complete_data)
        # Because we collate multiple runs into a single data source, it's possible that we have duplicated data at
        # the uprn level, so we dedupe based on the newest dataset_run_date

        df = df.sort_values("dataset_run_date", ascending=False).drop_duplicates(subset="uprn", keep="first")
        df = df.drop(columns=["dataset_run_date"])

        for col in self.COLS_TO_STRINGIFY:
            df[col] = df[col].astype(str)

        # Save the data back to s3, but this time as a parquet file
        self.consumption_data_filepath = f"energy_consumption/{self.run_date}/energy_consumption_dataset.parquet"
        logger.info(f"Storing energy consumption dataset in s3 at {self.consumption_data_filepath}")
        save_dataframe_to_s3_parquet(
            bucket_name=self.bucket,
            file_key=self.consumption_data_filepath,
            df=df
        )

        # We also estimate the energy consumption reduction from this data, by band
        df["total_consumption"] = df["heating_kwh"] + df["hot_water_kwh"]
        consumption_averages = df.groupby("current-energy-efficiency")["total_consumption"].mean().reset_index()
        df = df.drop(columns=["total_consumption"])

        self.consumption_averages_filepath = f"energy_consumption/{self.run_date}/consumption_averages.parquet"
        logger.info(f"Storing consumption averages in s3 at {self.consumption_averages_filepath}")
        # Save the consumption averages back to s3
        save_dataframe_to_s3_parquet(
            bucket_name="retrofit-data-dev",
            file_key=self.consumption_averages_filepath,
            df=consumption_averages
        )

        self.data = df

    def transform(
        self, data: pd.DataFrame, cleaned, new=False, save=False
    ):
        """
        Given the input EPCs, this method will transform the data into a format that can be used by the model
        This method can be used to transform the training data, or new epcs within the backend engine
        :return:
        """
        if save and self.bucket is None:
            raise Exception("bucket not set, cannot save data")

        if data.empty:
            # If we have no data
            return data

        # TODO: New is a temporary parameter, which will transform the epc descriptions to their transformed features
        #       in anticipation of the new model

        data["lodgement-date"] = pd.to_datetime(data["lodgement-date"])
        data["lodgement-year"] = data["lodgement-date"].dt.year
        data["lodgement-month"] = data["lodgement-date"].dt.month

        # For walls, roof, floor description where we have average thermal transmittance, to avoid too many
        # categories
        # we group them
        ranges = {
            "lessthan 0.1": (0, 0.1),
            "0.1 - 0.3": (0.1, 0.3),
            "0.3 - 0.5": (0.3, 0.5),
            "morethan 0.5": (0.5, 2.5),
        }

        # Generate the lookup table
        thermal_transmittance_lookup_table = []
        for i in range(1, 251):
            value = i / 100
            for label, (low, high) in ranges.items():
                if low < value <= high:
                    thermal_transmittance_lookup_table.append({"from": value, "to": label})
                    break

        # Convert to DataFrame for display
        thermal_transmittance_lookup_table = pd.DataFrame(thermal_transmittance_lookup_table)
        thermal_transmittance_lookup_table["from"] = thermal_transmittance_lookup_table["from"].astype(str)

        # Apply the lookup table to the data
        for feature in ["walls-description", "roof-description", "floor-description"]:
            cleaned_df = pd.DataFrame(cleaned[feature])[["original_description", "thermal_transmittance"]]
            # Round to 2 decimal places and convert to string
            cleaned_df["thermal_transmittance"] = cleaned_df["thermal_transmittance"].round(2).astype(str)

            data = data.merge(
                cleaned_df,
                how="left",
                left_on=feature,
                right_on="original_description",
            )
            # We now have the thermal transmittance in the data, which we can use to group with the lookup table
            data = data.merge(
                thermal_transmittance_lookup_table,
                how="left",
                left_on="thermal_transmittance",
                right_on="from",
            )
            # Where "to" is populated, replace feature with to
            data[feature] = np.where(
                ~pd.isnull(data["to"]),
                data["to"],
                data[feature]
            )
            data = data.drop(columns=["original_description", "thermal_transmittance", "from", "to"])

        data[self.NUMERICAL_COLUMNS] = data[self.NUMERICAL_COLUMNS].apply(pd.to_numeric)
        data[self.CATEGORICAL_COLUMNS] = data[self.CATEGORICAL_COLUMNS].astype(str)

        # Create new features:
        data['estimate_annual_kwh'] = data['energy-consumption-current'] * data['total-floor-area']

        # Ensure this is string, because we could have mixed types
        data["lodgement-datetime"] = data["lodgement-datetime"].astype(str)

        if save:
            self.model_training_data_filepath = f"energy_consumption/{self.run_date}/training_data.parquet"
            logger.info(f"Storing energy consumption dataset in s3 at {self.consumption_data_filepath}")
            save_dataframe_to_s3_parquet(
                bucket_name=self.bucket,
                file_key=self.model_training_data_filepath,
                df=data
            )
            return

        return data

    @staticmethod
    def _prepare_epc(p: Property):
        """
        Given an instance of the property class, this method will ensure that the EPC is ready for scoring with the
        kwh models. In the backend, we perform some cleaning and transformation on an EPC so we just ensure that the
        data is in the format required by the model
        :return:
        """

        epc = p.epc_record.to_dict(case="kebab", source="prepared")
        numeric_cols = [
            'current-energy-efficiency',
            'potential-energy-efficiency', 'environment-impact-current',
            'environment-impact-potential', 'energy-consumption-current',
            'energy-consumption-potential', 'co2-emissions-current',
            'co2-emiss-curr-per-floor-area', 'co2-emissions-potential',
            'lighting-cost-current', 'lighting-cost-potential',
            'heating-cost-current', 'heating-cost-potential',
            'hot-water-cost-current', 'hot-water-cost-potential',
            'total-floor-area', 'multi-glaze-proportion',
            'extension-count', 'number-habitable-rooms', 'number-heated-rooms',
            'low-energy-lighting', 'number-open-fireplaces',
            'wind-turbine-count', 'unheated-corridor-length',
            'floor-height', 'photo-supply', 'fixed-lighting-outlets-count',
            'low-energy-fixed-light-count',
        ]
        for v in numeric_cols:
            if epc[v] is not None:
                epc[v] = float(epc[v])

        bools_to_remap = ['mains-gas-flag', 'flat-top-storey']
        bool_map = {
            True: "Y",
            False: "N",
            None: "N",
            "Y": "Y",
            "N": "N",
        }
        for v in bools_to_remap:
            epc[v] = bool_map[epc[v]]

        no_data = {
            "floor-level": "NODATA!",
            "floor-energy-eff": "NO DATA!"
        }
        for v, fill_val in no_data.items():
            if pd.isnull(epc[v]):
                epc[v] = fill_val

        return epc

    def prepare_epc(self, input_properties: list[Property]):
        scoring_data = pd.DataFrame([self._prepare_epc(p) for p in input_properties])
        scoring_data["lodgement-year"] = pd.to_datetime(scoring_data["lodgement-date"]).dt.year
        scoring_data["lodgement-month"] = pd.to_datetime(scoring_data["lodgement-date"]).dt.month

        scoring_data["id"] = scoring_data["uprn"].copy()

        return scoring_data

    def convert_cost_to_today(self, original_cost, lodgement_date):
        """
        Given energy costs in an EPC, this function converts that energy cost to a figure based on today's energy costs
        (or as close to today as possible)
        :param original_cost: The original energy cost
        :param lodgement_date: The date the EPC was lodged
        :return:
        """
        closest_date = self.retail_price_comparison.iloc[
            (self.retail_price_comparison['Date'] - lodgement_date).abs().argsort()[:1]
        ]['Date'].values[0]
        closest_date = pd.Timestamp(closest_date)

        # Extract the tariff price on the closest date
        tariff_2024 = self.retail_price_comparison[
            self.retail_price_comparison['Date'] == closest_date
            ]['Average standard variable tariff (Large legacy suppliers)'].values[0]

        # Extract the latest available tariff price
        latest_tariff = self.retail_price_comparison[
            'Average standard variable tariff (Large legacy suppliers)'
        ].iloc[-1]

        # Calculate the ratio
        ratio = float(latest_tariff) / float(tariff_2024)

        # Calculate the updated heating cost
        updated_cost = original_cost * ratio

        return updated_cost