import re
import pandas as pd
import numpy as np
from datetime import datetime
from typing import Optional
from tqdm import tqdm
from utils.logger import setup_logger
from utils.s3 import (
    list_files_in_s3_folder,
    read_pickle_from_s3,
    save_dataframe_to_s3_parquet,
    read_dataframe_from_s3_parquet,
    read_csv_from_s3
)
from backend.Property import Property

logger = setup_logger()


class KwhData:
    """
    Builds, stores and transforms the kWh (heating + hot water) consumption datasets.

    The class reads raw scraped consumption records from S3, combines them with EPC data,
    stores the result as parquet, and prepares EPC frames for the kWh models.
    """

    # Bucket holding the raw, pickled consumption records read by `combine`
    RAW_DATA_BUCKET = "retrofit-datalake-dev"

    # Matches e.g. "12,345 kWh per year"; the captured group must start with a digit
    _KWH_PATTERN = re.compile(r'(\d[\d,]*) kWh per year')

    # Tariff series used by `convert_cost_to_today`
    _TARIFF_COLUMN = 'Average standard variable tariff (Large legacy suppliers)'

    # Bands used to group average thermal transmittance values: label -> (exclusive low, inclusive high)
    THERMAL_TRANSMITTANCE_BANDS = {
        "lessthan 0.1": (0, 0.1),
        "0.1 - 0.3": (0.1, 0.3),
        "0.3 - 0.5": (0.3, 0.5),
        "morethan 0.5": (0.5, 2.5),
    }

    COLS_TO_STRINGIFY = ["main-heating-controls", "floor-level"]

    CATEGORICAL_COLUMNS = [
        "lodgement-year",
        "lodgement-month",
        "main-fuel",
        "mainheat-description",
        "number-heated-rooms",
        "number-habitable-rooms",
        "mainheat-energy-eff",
        "mainheatcont-description",
        "property-type",
        "built-form",
        "construction-age-band",
        "secondheat-description",
        "hotwater-description",
        "hot-water-energy-eff",
        "walls-description",
        "walls-energy-eff",
        "roof-description",
        "roof-energy-eff",
        "floor-description",
        "county",
        "windows-description",
        "windows-energy-eff",
        "flat-top-storey",
        "flat-storey-count",
        "unheated-corridor-length",
        "solar-water-heating-flag",
        "mechanical-ventilation",
        "low-energy-lighting",
        "environment-impact-current",
        "energy-tariff",
        "current-energy-rating",
        "floor-level"
    ]

    NUMERICAL_COLUMNS = [
        'heating-cost-current',
        'total-floor-area',
        'co2-emissions-current',
        'energy-consumption-current',
        'heating-cost-potential',
        'hot-water-cost-current',
        'current-energy-efficiency'
    ]

    def __init__(self, bucket: Optional[str] = None, read_consumption_data: bool = False):
        """
        :param bucket: S3 bucket used for reading and writing the processed datasets.
        :param read_consumption_data: When True, the latest consumption averages and the retail price
            comparison table are loaded from S3 immediately.
        """
        self.run_date = datetime.now().strftime("%Y-%m-%d")
        self.bucket = bucket

        self.data = None
        self.consumption_data_filepath = None
        self.consumption_averages_filepath = None
        self.model_training_data_filepath = None

        self.consumption_averages = None
        self.retail_price_comparison = None

        if read_consumption_data:
            self.get_consumption_data()
            self.read_retail_price_comparison()

    def get_consumption_data(self):
        """
        Load the most recent consumption averages file from S3 into `self.consumption_averages`.

        Files are expected under "energy_consumption/<run date>/"; the run date is parsed from the
        second path segment of each key.

        :raises ValueError: if no consumption averages file is found.
        """
        s3_contents = list_files_in_s3_folder(bucket_name=self.bucket, folder_name="energy_consumption/")

        candidates = [
            {"run_date": pd.to_datetime(key.split("/")[1]), "filepath": key}
            for key in s3_contents
            if "consumption_averages.parquet" in key
        ]
        if not candidates:
            raise ValueError("No consumption averages data found, something went wrong")

        # Sort ascending by run date, so the last entry is the most recent run
        candidates = sorted(candidates, key=lambda c: c["run_date"])

        self.consumption_averages = read_dataframe_from_s3_parquet(
            bucket_name=self.bucket, file_key=candidates[-1]["filepath"]
        )

    def read_retail_price_comparison(self):
        """
        Load the retail price comparison CSV from S3 into `self.retail_price_comparison`.

        The column names are overwritten with a fixed header and the 'Date' column is parsed to
        datetimes; values that cannot be parsed become NaT.
        """
        data = read_csv_from_s3(
            bucket_name=self.bucket, filepath="energy_consumption/retail-price-comparison.csv"
        )

        header = [
            'Date',
            'Average standard variable tariff (Large legacy suppliers)',
            'Average standard variable tariff (Other suppliers)',
            'Average fixed tariff',
            'Cheapest tariff (Large legacy suppliers)',
            'Cheapest tariff (All suppliers)',
            'Cheapest tariff (Basket)',
            'Default tariff cap level'
        ]

        self.retail_price_comparison = pd.DataFrame(data)
        self.retail_price_comparison.columns = header
        self.retail_price_comparison['Date'] = pd.to_datetime(
            self.retail_price_comparison['Date'], errors='coerce'
        )

    @staticmethod
    def extract_kwh_value(text: str) -> Optional[int]:
        """
        Extract the numerical kWh value from a given string.

        :param text: The input string containing the kWh value, e.g. "Uses 12,345 kWh per year".
        :return: The extracted kWh value as an integer, or None when the input is not a string or
            contains no "<number> kWh per year" fragment.
        """
        if not isinstance(text, str):
            # Missing text (e.g. None) is treated the same as "no value found"
            return None

        match = KwhData._KWH_PATTERN.search(text)
        if match is None:
            return None

        # Thousands separators are removed before converting to an integer
        return int(match.group(1).replace(',', ''))

    def combine(self):
        """
        Combine the collected heating / hot water kWh records into a single dataset and store it.

        Raw records are read from `RAW_DATA_BUCKET`; records whose extracted SAP scores do not match
        the EPC are dropped. The deduplicated dataset and the per-SAP-score consumption averages are
        both written to `self.bucket`, and the dataset is kept on `self.data`.

        :return: None
        """
        # Firstly, list all of the saved files in s3
        data_files = list_files_in_s3_folder(
            bucket_name=self.RAW_DATA_BUCKET, folder_name="energy_consumption_data"
        )

        complete_data = []
        for file_key in tqdm(data_files):
            # The run date is the file name without its extension
            dataset_run_date = pd.Timestamp(file_key.split("/")[-1].split(".")[0])

            # NOTE(review): unpickling executes arbitrary code - only safe while this bucket is trusted
            records = read_pickle_from_s3(bucket_name=self.RAW_DATA_BUCKET, s3_file_name=file_key)

            for record in records:
                epc_data = record["epc"]

                # Only keep records where the retrieved energy consumption matches the EPC data
                current_matches = (
                    int(epc_data["current-energy-efficiency"]) == int(record["current_epc_efficiency"])
                )
                potential_matches = (
                    int(epc_data["potential-energy-efficiency"]) == int(record["potential_epc_efficiency"])
                )
                if not (current_matches and potential_matches):
                    continue

                complete_data.append(
                    {
                        **epc_data,
                        "heating_kwh": self.extract_kwh_value(record["heating_text"]),
                        "hot_water_kwh": self.extract_kwh_value(record["hot_water_text"]),
                        "dataset_run_date": dataset_run_date
                    }
                )

        df = pd.DataFrame(complete_data)
        # Because we collate multiple runs into a single data source, it's possible that we have duplicated data at
        # the uprn level, so we dedupe based on the newest dataset_run_date
        df = df.sort_values("dataset_run_date", ascending=False).drop_duplicates(subset="uprn", keep="first")
        df = df.drop(columns=["dataset_run_date"])

        for col in self.COLS_TO_STRINGIFY:
            df[col] = df[col].astype(str)

        # Save the data back to s3, but this time as a parquet file
        self.consumption_data_filepath = f"energy_consumption/{self.run_date}/energy_consumption_dataset.parquet"
        logger.info(f"Storing energy consumption dataset in s3 at {self.consumption_data_filepath}")
        save_dataframe_to_s3_parquet(
            bucket_name=self.bucket,
            file_key=self.consumption_data_filepath,
            df=df
        )

        # We also estimate the energy consumption reduction from this data, by band
        df["total_consumption"] = df["heating_kwh"] + df["hot_water_kwh"]
        consumption_averages = df.groupby("current-energy-efficiency")["total_consumption"].mean().reset_index()
        df = df.drop(columns=["total_consumption"])

        self.consumption_averages_filepath = f"energy_consumption/{self.run_date}/consumption_averages.parquet"
        logger.info(f"Storing consumption averages in s3 at {self.consumption_averages_filepath}")
        # Written to the same bucket that `get_consumption_data` reads the averages back from
        save_dataframe_to_s3_parquet(
            bucket_name=self.bucket,
            file_key=self.consumption_averages_filepath,
            df=consumption_averages
        )

        self.data = df

    @classmethod
    def _thermal_transmittance_lookup(cls) -> pd.DataFrame:
        """
        Build the lookup table mapping thermal transmittance values (0.01 to 2.50, as strings) to bands.

        :return: DataFrame with columns "from" (value as string) and "to" (band label).
        """
        rows = []
        for i in range(1, 251):
            value = i / 100
            for label, (low, high) in cls.THERMAL_TRANSMITTANCE_BANDS.items():
                if low < value <= high:
                    rows.append({"from": value, "to": label})
                    break

        lookup = pd.DataFrame(rows)
        # Stored as strings so the join matches the stringified, rounded values in the data
        lookup["from"] = lookup["from"].astype(str)
        return lookup

    def transform(
        self,
        data: pd.DataFrame,
        cleaned,
        new=False,
        save=False
    ):
        """
        Given the input EPCs, this method will transform the data into a format that can be used by the model.
        This method can be used to transform the training data, or new epcs within the backend engine.

        :param data: EPC data. The lodgement date/year/month columns are written onto this frame in place.
        :param cleaned: Mapping keyed by "walls-description", "roof-description" and "floor-description";
            each value must be convertible to a DataFrame with "original_description" and
            "thermal_transmittance" columns.
        :param new: Currently unused (temporary parameter in anticipation of the new model).
        :param save: When True, the transformed data is written to S3 and nothing is returned.
        :return: The transformed DataFrame, the untouched input if it is empty, or None when `save` is True.
        :raises ValueError: if `save` is requested but no bucket is set.
        """
        if save and self.bucket is None:
            raise ValueError("bucket not set, cannot save data")

        if data.empty:
            # If we have no data
            return data

        # TODO: New is a temporary parameter, which will transform the epc descriptions to their transformed features
        #  in anticipation of the new model

        data["lodgement-date"] = pd.to_datetime(data["lodgement-date"], format="mixed", errors="coerce")
        data["lodgement-year"] = data["lodgement-date"].dt.year
        data["lodgement-month"] = data["lodgement-date"].dt.month

        # For walls, roof, floor description where we have average thermal transmittance, we group the values
        # into bands to avoid too many categories
        thermal_transmittance_lookup_table = self._thermal_transmittance_lookup()

        for feature in ["walls-description", "roof-description", "floor-description"]:
            cleaned_df = pd.DataFrame(cleaned[feature])[["original_description", "thermal_transmittance"]]
            # Round to 2 decimal places and convert to string
            cleaned_df["thermal_transmittance"] = cleaned_df["thermal_transmittance"].round(2).astype(str)

            # NOTE(review): duplicated original_description values in `cleaned` would duplicate rows here
            data = data.merge(
                cleaned_df,
                how="left",
                left_on=feature,
                right_on="original_description",
            )
            # We now have the thermal transmittance in the data, which we can use to group with the lookup table
            data = data.merge(
                thermal_transmittance_lookup_table,
                how="left",
                left_on="thermal_transmittance",
                right_on="from",
            )
            # Where "to" is populated, replace feature with to
            data[feature] = np.where(
                ~pd.isnull(data["to"]),
                data["to"],
                data[feature]
            )
            data = data.drop(columns=["original_description", "thermal_transmittance", "from", "to"])

        data[self.NUMERICAL_COLUMNS] = data[self.NUMERICAL_COLUMNS].apply(pd.to_numeric)
        data[self.CATEGORICAL_COLUMNS] = data[self.CATEGORICAL_COLUMNS].astype(str)

        # Create new features:
        data['estimate_annual_kwh'] = data['energy-consumption-current'] * data['total-floor-area']

        # Ensure this is string, because we could have mixed types
        data["lodgement-datetime"] = data["lodgement-datetime"].astype(str)

        if save:
            self.model_training_data_filepath = f"energy_consumption/{self.run_date}/training_data.parquet"
            logger.info(f"Storing energy consumption dataset in s3 at {self.model_training_data_filepath}")
            save_dataframe_to_s3_parquet(
                bucket_name=self.bucket,
                file_key=self.model_training_data_filepath,
                df=data
            )
            # Nothing is returned when the data is persisted
            return

        return data

    @staticmethod
    def _prepare_epc(p: Property):
        """
        Given an instance of the property class, this method will ensure that the EPC is ready for scoring
        with the kwh models. In the backend, we perform some cleaning and transformation on an EPC, so we just
        ensure that the data is in the format required by the model.

        :param p: The property whose prepared EPC record is converted.
        :return: The EPC as a dict with kebab-case keys: numeric fields as floats (None left as-is),
            boolean flags mapped to "Y"/"N", and missing floor fields filled with placeholders.
        """
        epc = p.epc_record.to_dict(case="kebab", source="prepared")

        numeric_cols = [
            'current-energy-efficiency',
            'potential-energy-efficiency',
            'environment-impact-current',
            'environment-impact-potential',
            'energy-consumption-current',
            'energy-consumption-potential',
            'co2-emissions-current',
            'co2-emiss-curr-per-floor-area',
            'co2-emissions-potential',
            'lighting-cost-current',
            'lighting-cost-potential',
            'heating-cost-current',
            'heating-cost-potential',
            'hot-water-cost-current',
            'hot-water-cost-potential',
            'total-floor-area',
            'multi-glaze-proportion',
            'extension-count',
            'number-habitable-rooms',
            'number-heated-rooms',
            'low-energy-lighting',
            'number-open-fireplaces',
            'wind-turbine-count',
            'unheated-corridor-length',
            'floor-height',
            'photo-supply',
            'fixed-lighting-outlets-count',
            'low-energy-fixed-light-count',
        ]
        for col in numeric_cols:
            if epc[col] is not None:
                epc[col] = float(epc[col])

        # Flags may arrive as booleans, None or already as "Y"/"N"; None is treated as "N"
        bool_map = {
            True: "Y",
            False: "N",
            None: "N",
            "Y": "Y",
            "N": "N",
        }
        for col in ['mains-gas-flag', 'flat-top-storey']:
            epc[col] = bool_map[epc[col]]

        no_data = {
            "floor-level": "NODATA!",
            "floor-energy-eff": "NO DATA!"
        }
        for col, fill_val in no_data.items():
            if pd.isnull(epc[col]):
                epc[col] = fill_val

        return epc

    def prepare_epc(self, input_properties: list[Property]):
        """
        Build the scoring frame for a list of properties.

        :param input_properties: Properties whose EPCs should be prepared.
        :return: DataFrame with one row per property, plus "lodgement-year", "lodgement-month" and an
            "id" column copied from "uprn". An empty DataFrame is returned for an empty input list.
        """
        if not input_properties:
            return pd.DataFrame()

        scoring_data = pd.DataFrame([self._prepare_epc(p) for p in input_properties])

        # Parse the lodgement date once and derive both features from it
        lodgement_dates = pd.to_datetime(scoring_data["lodgement-date"], format="mixed")
        scoring_data["lodgement-year"] = lodgement_dates.dt.year
        scoring_data["lodgement-month"] = lodgement_dates.dt.month
        scoring_data["id"] = scoring_data["uprn"].copy()

        return scoring_data

    def convert_cost_to_today(self, original_cost, lodgement_date):
        """
        Given energy costs in an EPC, this function converts that energy cost to a figure based on today's
        energy costs (or as close to today as possible).

        The cost is scaled by the ratio between the last tariff in the retail price comparison table and
        the tariff on the date closest to the lodgement date.

        :param original_cost: The original energy cost
        :param lodgement_date: The date the EPC was lodged
        :return: The original cost scaled to the latest available tariff
        :raises ValueError: if no row of the price table has a usable date to compare against.
        """
        prices = self.retail_price_comparison
        lodgement_date = pd.Timestamp(lodgement_date)

        # Index is reset so that idxmin yields a position, whatever index the price table carries
        distance = (prices['Date'] - lodgement_date).abs().reset_index(drop=True)
        if distance.isna().all():
            raise ValueError("No valid dates available in the retail price comparison data")

        # idxmin skips unparseable (NaT) dates, which argsort-based selection does not handle
        closest_position = distance.idxmin()

        # Tariff price on the date closest to the lodgement date
        tariff_at_lodgement = prices[self._TARIFF_COLUMN].iloc[closest_position]

        # Latest available tariff price (last row of the table)
        latest_tariff = prices[self._TARIFF_COLUMN].iloc[-1]

        ratio = float(latest_tariff) / float(tariff_at_lodgement)

        return original_cost * ratio