Model/etl/bill_savings/KwhData.py
Khalim Conn-Kowlessar 84d4263d9a removing data
2026-03-18 19:17:22 +00:00

370 lines
15 KiB
Python

import re
import pandas as pd
import numpy as np
from datetime import datetime
from tqdm import tqdm
from utils.logger import setup_logger
from utils.s3 import (
list_files_in_s3_folder, read_pickle_from_s3, save_dataframe_to_s3_parquet, read_dataframe_from_s3_parquet,
read_csv_from_s3
)
from backend.Property import Property
logger = setup_logger()
class KwhData:
COLS_TO_STRINGIFY = ["main-heating-controls", "floor-level"]
CATEGORICAL_COLUMNS = [
"lodgement-year", "lodgement-month", "main-fuel", "mainheat-description", "number-heated-rooms",
"number-habitable-rooms", "mainheat-energy-eff", "mainheatcont-description", "property-type",
"built-form",
"construction-age-band", "secondheat-description", "hotwater-description", "hot-water-energy-eff",
"walls-description", "walls-energy-eff", "roof-description", "roof-energy-eff", "floor-description",
"county",
"windows-description", "windows-energy-eff", "flat-top-storey",
"flat-storey-count", "unheated-corridor-length", "solar-water-heating-flag", "mechanical-ventilation",
"low-energy-lighting", "environment-impact-current", "energy-tariff", "current-energy-rating",
"floor-level"
]
NUMERICAL_COLUMNS = [
'heating-cost-current', 'total-floor-area', 'co2-emissions-current', 'energy-consumption-current',
'heating-cost-potential', 'hot-water-cost-current', 'current-energy-efficiency'
]
def __init__(self, bucket=None, read_consumption_data=False):
self.run_date = datetime.now().strftime("%Y-%m-%d")
self.bucket = bucket
self.data = None
self.consumption_data_filepath = None
self.consumption_averages_filepath = None
self.model_training_data_filepath = None
self.consumption_averages = None
self.retail_price_comparison = None
if read_consumption_data:
self.get_consumption_data()
self.read_retail_price_comparison()
def get_consumption_data(self):
# Look for the latest version of this file
s3_contents = list_files_in_s3_folder(bucket_name=self.bucket, folder_name="energy_consumption/")
consumption_averages = [
{"run_date": pd.to_datetime(x.split("/")[1]), "filepath": x}
for x in s3_contents if "consumption_averages.parquet" in x
]
# Get the file with the soonest run date
consumption_averages = sorted(consumption_averages, key=lambda x: x["run_date"])
if not consumption_averages:
raise ValueError("No consumption averages data found, something went wrong")
self.consumption_averages = read_dataframe_from_s3_parquet(
bucket_name=self.bucket,
file_key=consumption_averages[-1]["filepath"]
)
def read_retail_price_comparison(self):
data = read_csv_from_s3(
bucket_name=self.bucket,
filepath="energy_consumption/retail-price-comparison.csv"
)
header = ['Date', 'Average standard variable tariff (Large legacy suppliers)',
'Average standard variable tariff (Other suppliers)', 'Average fixed tariff',
'Cheapest tariff (Large legacy suppliers)', 'Cheapest tariff (All suppliers)',
'Cheapest tariff (Basket)', 'Default tariff cap level']
# Extract data rows
data_rows = []
for row in data[1:]:
date = row['\ufeff"']
values = row[None]
data_rows.append([date] + values)
self.retail_price_comparison = pd.DataFrame(data_rows, columns=header)
self.retail_price_comparison['Date'] = pd.to_datetime(self.retail_price_comparison['Date'], errors='coerce')
@staticmethod
def extract_kwh_value(text: str):
"""
Extract the numerical kWh value from a given string.
:param text: The input string containing the kWh value.
:return: The extracted numerical kWh value as an integer.
"""
# Use regular expression to find the numerical value followed by "kWh per year"
match = re.search(r'([\d,]+) kWh per year', text)
if match:
# Remove commas from the extracted value and convert to integer
kwh_value = int(match.group(1).replace(',', ''))
return kwh_value
else:
# If no match is found, return None or raise an exception
return None
def combine(self):
"""
Given the data that is collected containing the kwh values for heating and hot water, this method will combine
and save the data
:return:
"""
# Firstly, list all of the saved files in s3
data_files = list_files_in_s3_folder(bucket_name="retrofit-datalake-dev", folder_name="energy_consumption_data")
complete_data = []
for files in tqdm(data_files):
dataset_run_date = files.split("/")[-1].split(".")[0]
# Extract the date from the file name
dataset_run_date = pd.Timestamp(dataset_run_date)
# Load the data from the file
data = read_pickle_from_s3(bucket_name="retrofit-datalake-dev", s3_file_name=files)
# We check that the retrieved energy consumption sufficiently matches the EPC data
internal_dataset = []
for x in data:
epc_data = x["epc"]
epc_sap = epc_data["current-energy-efficiency"]
epc_potential_sap = epc_data["potential-energy-efficiency"]
# Make sure this matches the extracted sap
if int(epc_sap) != int(x["current_epc_efficiency"]) or int(epc_potential_sap) != int(
x["potential_epc_efficiency"]
):
continue
heating_kwh = self.extract_kwh_value(x["heating_text"])
hot_water_kwh = self.extract_kwh_value(x["hot_water_text"])
internal_dataset.append(
{
**epc_data,
"heating_kwh": heating_kwh,
"hot_water_kwh": hot_water_kwh,
"dataset_run_date": dataset_run_date
}
)
complete_data.extend(internal_dataset)
df = pd.DataFrame(complete_data)
# Because we collate multiple runs into a single data source, it's possible that we have duplicated data at
# the uprn level, so we dedupe based on the newest dataset_run_date
df = df.sort_values("dataset_run_date", ascending=False).drop_duplicates(subset="uprn", keep="first")
df = df.drop(columns=["dataset_run_date"])
for col in self.COLS_TO_STRINGIFY:
df[col] = df[col].astype(str)
# Save the data back to s3, but this time as a parquet file
self.consumption_data_filepath = f"energy_consumption/{self.run_date}/energy_consumption_dataset.parquet"
logger.info(f"Storing energy consumption dataset in s3 at {self.consumption_data_filepath}")
save_dataframe_to_s3_parquet(
bucket_name=self.bucket,
file_key=self.consumption_data_filepath,
df=df
)
# We also estimate the energy consumption reduction from this data, by band
df["total_consumption"] = df["heating_kwh"] + df["hot_water_kwh"]
consumption_averages = df.groupby("current-energy-efficiency")["total_consumption"].mean().reset_index()
df = df.drop(columns=["total_consumption"])
self.consumption_averages_filepath = f"energy_consumption/{self.run_date}/consumption_averages.parquet"
logger.info(f"Storing consumption averages in s3 at {self.consumption_averages_filepath}")
# Save the consumption averages back to s3
save_dataframe_to_s3_parquet(
bucket_name="retrofit-data-dev",
file_key=self.consumption_averages_filepath,
df=consumption_averages
)
self.data = df
def transform(
self, data: pd.DataFrame, cleaned, new=False, save=False
):
"""
Given the input EPCs, this method will transform the data into a format that can be used by the model
This method can be used to transform the training data, or new epcs within the backend engine
:return:
"""
if save and self.bucket is None:
raise Exception("bucket not set, cannot save data")
if data.empty:
# If we have no data
return data
# TODO: New is a temporary parameter, which will transform the epc descriptions to their transformed features
# in anticipation of the new model
data["lodgement-date"] = pd.to_datetime(data["lodgement-date"])
data["lodgement-year"] = data["lodgement-date"].dt.year
data["lodgement-month"] = data["lodgement-date"].dt.month
# For walls, roof, floor description where we have average thermal transmittance, to avoid too many
# categories
# we group them
ranges = {
"lessthan 0.1": (0, 0.1),
"0.1 - 0.3": (0.1, 0.3),
"0.3 - 0.5": (0.3, 0.5),
"morethan 0.5": (0.5, 2.5),
}
# Generate the lookup table
thermal_transmittance_lookup_table = []
for i in range(1, 251):
value = i / 100
for label, (low, high) in ranges.items():
if low < value <= high:
thermal_transmittance_lookup_table.append({"from": value, "to": label})
break
# Convert to DataFrame for display
thermal_transmittance_lookup_table = pd.DataFrame(thermal_transmittance_lookup_table)
thermal_transmittance_lookup_table["from"] = thermal_transmittance_lookup_table["from"].astype(str)
# Apply the lookup table to the data
for feature in ["walls-description", "roof-description", "floor-description"]:
cleaned_df = pd.DataFrame(cleaned[feature])[["original_description", "thermal_transmittance"]]
# Round to 2 decimal places and convert to string
cleaned_df["thermal_transmittance"] = cleaned_df["thermal_transmittance"].round(2).astype(str)
data = data.merge(
cleaned_df,
how="left",
left_on=feature,
right_on="original_description",
)
# We now have the thermal transmittance in the data, which we can use to group with the lookup table
data = data.merge(
thermal_transmittance_lookup_table,
how="left",
left_on="thermal_transmittance",
right_on="from",
)
# Where "to" is populated, replace feature with to
data[feature] = np.where(
~pd.isnull(data["to"]),
data["to"],
data[feature]
)
data = data.drop(columns=["original_description", "thermal_transmittance", "from", "to"])
data[self.NUMERICAL_COLUMNS] = data[self.NUMERICAL_COLUMNS].apply(pd.to_numeric)
data[self.CATEGORICAL_COLUMNS] = data[self.CATEGORICAL_COLUMNS].astype(str)
# Create new features:
data['estimate_annual_kwh'] = data['energy-consumption-current'] * data['total-floor-area']
# Ensure this is string, because we could have mixed types
data["lodgement-datetime"] = data["lodgement-datetime"].astype(str)
if save:
self.model_training_data_filepath = f"energy_consumption/{self.run_date}/training_data.parquet"
logger.info(f"Storing energy consumption dataset in s3 at {self.consumption_data_filepath}")
save_dataframe_to_s3_parquet(
bucket_name=self.bucket,
file_key=self.model_training_data_filepath,
df=data
)
return
return data
@staticmethod
def _prepare_epc(p: Property):
"""
Given an instance of the property class, this method will ensure that the EPC is ready for scoring with the
kwh models. In the backend, we perform some cleaning and transformation on an EPC so we just ensure that the
data is in the format required by the model
:return:
"""
epc = p.epc_record.to_dict(case="kebab", source="prepared")
numeric_cols = [
'current-energy-efficiency',
'potential-energy-efficiency', 'environment-impact-current',
'environment-impact-potential', 'energy-consumption-current',
'energy-consumption-potential', 'co2-emissions-current',
'co2-emiss-curr-per-floor-area', 'co2-emissions-potential',
'lighting-cost-current', 'lighting-cost-potential',
'heating-cost-current', 'heating-cost-potential',
'hot-water-cost-current', 'hot-water-cost-potential',
'total-floor-area', 'multi-glaze-proportion',
'extension-count', 'number-habitable-rooms', 'number-heated-rooms',
'low-energy-lighting', 'number-open-fireplaces',
'wind-turbine-count', 'unheated-corridor-length',
'floor-height', 'photo-supply', 'fixed-lighting-outlets-count',
'low-energy-fixed-light-count',
]
for v in numeric_cols:
if epc[v] is not None:
epc[v] = float(epc[v])
bools_to_remap = ['mains-gas-flag', 'flat-top-storey']
bool_map = {
True: "Y",
False: "N",
None: "N",
"Y": "Y",
"N": "N",
}
for v in bools_to_remap:
epc[v] = bool_map[epc[v]]
no_data = {
"floor-level": "NODATA!",
"floor-energy-eff": "NO DATA!"
}
for v, fill_val in no_data.items():
if pd.isnull(epc[v]):
epc[v] = fill_val
return epc
def prepare_epc(self, input_properties: list[Property]):
scoring_data = pd.DataFrame([self._prepare_epc(p) for p in input_properties])
scoring_data["lodgement-year"] = pd.to_datetime(scoring_data["lodgement-date"]).dt.year
scoring_data["lodgement-month"] = pd.to_datetime(scoring_data["lodgement-date"]).dt.month
scoring_data["id"] = scoring_data["uprn"].copy()
return scoring_data
def convert_cost_to_today(self, original_cost, lodgement_date):
"""
Given energy costs in an EPC, this function converts that energy cost to a figure based on today's energy costs
(or as close to today as possible)
:param original_cost: The original energy cost
:param lodgement_date: The date the EPC was lodged
:return:
"""
closest_date = self.retail_price_comparison.iloc[
(self.retail_price_comparison['Date'] - lodgement_date).abs().argsort()[:1]
]['Date'].values[0]
closest_date = pd.Timestamp(closest_date)
# Extract the tariff price on the closest date
tariff_2024 = self.retail_price_comparison[
self.retail_price_comparison['Date'] == closest_date
]['Average standard variable tariff (Large legacy suppliers)'].values[0]
# Extract the latest available tariff price
latest_tariff = self.retail_price_comparison[
'Average standard variable tariff (Large legacy suppliers)'
].iloc[-1]
# Calculate the ratio
ratio = float(latest_tariff) / float(tariff_2024)
# Calculate the updated heating cost
updated_cost = original_cost * ratio
return updated_cost