# Mirror of https://github.com/Hestia-Homes/Model.git
# Synced 2026-06-08 11:17:27 +00:00 - 370 lines, 15 KiB, Python
import re
|
|
import pandas as pd
|
|
import numpy as np
|
|
from datetime import datetime
|
|
from tqdm import tqdm
|
|
from utils.logger import setup_logger
|
|
from utils.s3 import (
|
|
list_files_in_s3_folder, read_pickle_from_s3, save_dataframe_to_s3_parquet, read_dataframe_from_s3_parquet,
|
|
read_csv_from_s3
|
|
)
|
|
from backend.Property import Property
|
|
|
|
# Module-level logger; the methods below use it to report where artefacts are written in S3.
logger = setup_logger()
|
|
|
|
|
|
class KwhData:
    """
    Builds, stores and transforms the energy consumption (kWh) datasets derived from EPC records.

    The class covers three concerns:
    - collating the raw per-run consumption extracts into a single dataset (``combine``)
    - turning EPC records into model-ready features (``transform``, ``prepare_epc``)
    - re-basing historic EPC energy costs onto the latest tariff (``convert_cost_to_today``)
    """

    COLS_TO_STRINGIFY = ["main-heating-controls", "floor-level"]

    CATEGORICAL_COLUMNS = [
        "lodgement-year", "lodgement-month", "main-fuel", "mainheat-description", "number-heated-rooms",
        "number-habitable-rooms", "mainheat-energy-eff", "mainheatcont-description", "property-type",
        "built-form",
        "construction-age-band", "secondheat-description", "hotwater-description", "hot-water-energy-eff",
        "walls-description", "walls-energy-eff", "roof-description", "roof-energy-eff", "floor-description",
        "county",
        "windows-description", "windows-energy-eff", "flat-top-storey",
        "flat-storey-count", "unheated-corridor-length", "solar-water-heating-flag", "mechanical-ventilation",
        "low-energy-lighting", "environment-impact-current", "energy-tariff", "current-energy-rating",
        "floor-level"
    ]

    NUMERICAL_COLUMNS = [
        'heating-cost-current', 'total-floor-area', 'co2-emissions-current', 'energy-consumption-current',
        'heating-cost-potential', 'hot-water-cost-current', 'current-energy-efficiency'
    ]

    # Location of the raw, per-run consumption extracts read by ``combine``.
    RAW_DATA_BUCKET = "retrofit-datalake-dev"
    RAW_DATA_FOLDER = "energy_consumption_data"
    # NOTE(review): the averages are written to this fixed bucket, while ``get_consumption_data`` looks for them
    # in ``self.bucket``. The two only line up when ``bucket`` is "retrofit-data-dev" - confirm this is intended.
    CONSUMPTION_AVERAGES_BUCKET = "retrofit-data-dev"

    # Column layout applied to the retail price comparison file.
    RETAIL_PRICE_HEADER = [
        'Date', 'Average standard variable tariff (Large legacy suppliers)',
        'Average standard variable tariff (Other suppliers)', 'Average fixed tariff',
        'Cheapest tariff (Large legacy suppliers)', 'Cheapest tariff (All suppliers)',
        'Cheapest tariff (Basket)', 'Default tariff cap level'
    ]
    # Tariff series used as the price index when re-basing costs.
    TARIFF_COLUMN = 'Average standard variable tariff (Large legacy suppliers)'

    # A figure such as "12,345 kWh per year"; must start with a digit so a stray comma is never captured alone.
    _KWH_PATTERN = re.compile(r'(\d[\d,]*) kWh per year')

    # Buckets used to group average thermal transmittance values: label -> (exclusive low, inclusive high).
    THERMAL_TRANSMITTANCE_RANGES = {
        "lessthan 0.1": (0, 0.1),
        "0.1 - 0.3": (0.1, 0.3),
        "0.3 - 0.5": (0.3, 0.5),
        "morethan 0.5": (0.5, 2.5),
    }
    THERMAL_TRANSMITTANCE_FEATURES = ["walls-description", "roof-description", "floor-description"]

    # EPC fields cast to float by ``_prepare_epc``.
    _EPC_NUMERIC_FIELDS = [
        'current-energy-efficiency',
        'potential-energy-efficiency', 'environment-impact-current',
        'environment-impact-potential', 'energy-consumption-current',
        'energy-consumption-potential', 'co2-emissions-current',
        'co2-emiss-curr-per-floor-area', 'co2-emissions-potential',
        'lighting-cost-current', 'lighting-cost-potential',
        'heating-cost-current', 'heating-cost-potential',
        'hot-water-cost-current', 'hot-water-cost-potential',
        'total-floor-area', 'multi-glaze-proportion',
        'extension-count', 'number-habitable-rooms', 'number-heated-rooms',
        'low-energy-lighting', 'number-open-fireplaces',
        'wind-turbine-count', 'unheated-corridor-length',
        'floor-height', 'photo-supply', 'fixed-lighting-outlets-count',
        'low-energy-fixed-light-count',
    ]
    # EPC flag fields rewritten to "Y"/"N" by ``_prepare_epc``; a missing flag is treated as "N".
    _EPC_FLAG_FIELDS = ['mains-gas-flag', 'flat-top-storey']
    _EPC_FLAG_MAP = {
        True: "Y",
        False: "N",
        None: "N",
        "Y": "Y",
        "N": "N",
    }
    # Placeholder written by ``_prepare_epc`` when the field is null.
    _EPC_NO_DATA_FILLS = {
        "floor-level": "NODATA!",
        "floor-energy-eff": "NO DATA!"
    }

    def __init__(self, bucket=None, read_consumption_data=False):
        """
        :param bucket: Name of the S3 bucket used for reading and writing datasets. May be None when the instance
            is only used for in-memory transformations.
        :param read_consumption_data: When True, the latest consumption averages and the retail price comparison
            are loaded from ``bucket`` straight away.
        """
        self.run_date = datetime.now().strftime("%Y-%m-%d")
        self.bucket = bucket
        self.data = None

        self.consumption_data_filepath = None
        self.consumption_averages_filepath = None
        self.model_training_data_filepath = None

        self.consumption_averages = None
        self.retail_price_comparison = None
        if read_consumption_data:
            self.get_consumption_data()
            self.read_retail_price_comparison()

    def get_consumption_data(self):
        """
        Load the most recent ``consumption_averages.parquet`` found under ``energy_consumption/`` in the bucket
        into ``self.consumption_averages``.

        The run date is taken from the second path segment of each key.

        :raises ValueError: If no consumption averages file exists.
        """
        s3_contents = list_files_in_s3_folder(bucket_name=self.bucket, folder_name="energy_consumption/")
        consumption_averages = [
            {"run_date": pd.to_datetime(x.split("/")[1]), "filepath": x}
            for x in s3_contents if "consumption_averages.parquet" in x
        ]
        if not consumption_averages:
            raise ValueError("No consumption averages data found, something went wrong")

        # Sort ascending by run date, so the last entry is the latest run
        consumption_averages = sorted(consumption_averages, key=lambda x: x["run_date"])

        self.consumption_averages = read_dataframe_from_s3_parquet(
            bucket_name=self.bucket,
            file_key=consumption_averages[-1]["filepath"]
        )

    def read_retail_price_comparison(self):
        """
        Load ``energy_consumption/retail-price-comparison.csv`` from the bucket into
        ``self.retail_price_comparison``, with the columns named as in ``RETAIL_PRICE_HEADER`` and ``Date`` parsed
        to datetimes (unparseable dates become NaT).
        """
        data = read_csv_from_s3(
            bucket_name=self.bucket,
            filepath="energy_consumption/retail-price-comparison.csv"
        )

        # The first entry is skipped. Each remaining row holds the date under the '\ufeff"' key and the other
        # values as a list under the None key.
        # NOTE(review): presumably an artefact of how read_csv_from_s3 parses the BOM / quoted header - confirm.
        data_rows = [[row['\ufeff"']] + row[None] for row in data[1:]]

        self.retail_price_comparison = pd.DataFrame(data_rows, columns=self.RETAIL_PRICE_HEADER)
        self.retail_price_comparison['Date'] = pd.to_datetime(self.retail_price_comparison['Date'], errors='coerce')

    @staticmethod
    def extract_kwh_value(text: str):
        """
        Extract the numerical kWh value from a given string.

        :param text: The input string containing the kWh value, e.g. "12,345 kWh per year".
        :return: The extracted kWh value as an integer, or None when ``text`` is not a string or contains no
            such figure.
        """
        if not isinstance(text, str):
            return None

        match = KwhData._KWH_PATTERN.search(text)
        if match is None:
            return None

        # Remove the thousands separators before converting to an integer
        return int(match.group(1).replace(',', ''))

    def _matching_records(self, records, dataset_run_date):
        """
        Keep the records whose scraped SAP scores agree with their EPC, and attach the extracted kWh values.

        :param records: Iterable of raw records, each holding an "epc" dict plus the scraped fields.
        :param dataset_run_date: Timestamp of the run the records belong to.
        :return: List of flat dicts (EPC fields, heating_kwh, hot_water_kwh, dataset_run_date).
        """
        matched = []
        for record in records:
            epc_data = record["epc"]
            current_matches = int(epc_data["current-energy-efficiency"]) == int(record["current_epc_efficiency"])
            potential_matches = (
                int(epc_data["potential-energy-efficiency"]) == int(record["potential_epc_efficiency"])
            )
            # Skip records where the retrieved consumption does not belong to this EPC
            if not (current_matches and potential_matches):
                continue

            matched.append(
                {
                    **epc_data,
                    "heating_kwh": self.extract_kwh_value(record["heating_text"]),
                    "hot_water_kwh": self.extract_kwh_value(record["hot_water_text"]),
                    "dataset_run_date": dataset_run_date
                }
            )
        return matched

    def combine(self):
        """
        Collate every raw consumption extract into a single dataset, store it as parquet, and store the mean
        total consumption (heating + hot water) per ``current-energy-efficiency`` value.

        The combined dataset is kept on ``self.data``.

        :raises ValueError: If no usable records are found.
        :return: None
        """
        data_files = list_files_in_s3_folder(bucket_name=self.RAW_DATA_BUCKET, folder_name=self.RAW_DATA_FOLDER)

        complete_data = []
        for file_key in tqdm(data_files):
            # The file name (without extension) is the date of the run
            dataset_run_date = pd.Timestamp(file_key.split("/")[-1].split(".")[0])

            # NOTE(review): unpickling executes arbitrary code - the bucket contents must be trusted.
            records = read_pickle_from_s3(bucket_name=self.RAW_DATA_BUCKET, s3_file_name=file_key)
            complete_data.extend(self._matching_records(records, dataset_run_date))

        if not complete_data:
            raise ValueError("No energy consumption records found, something went wrong")

        df = pd.DataFrame(complete_data)
        # Because we collate multiple runs into a single data source, it's possible that we have duplicated data at
        # the uprn level, so we dedupe based on the newest dataset_run_date
        df = df.sort_values("dataset_run_date", ascending=False).drop_duplicates(subset="uprn", keep="first")
        df = df.drop(columns=["dataset_run_date"])

        for col in self.COLS_TO_STRINGIFY:
            df[col] = df[col].astype(str)

        # Save the data back to s3, but this time as a parquet file
        self.consumption_data_filepath = f"energy_consumption/{self.run_date}/energy_consumption_dataset.parquet"
        logger.info(f"Storing energy consumption dataset in s3 at {self.consumption_data_filepath}")
        save_dataframe_to_s3_parquet(
            bucket_name=self.bucket,
            file_key=self.consumption_data_filepath,
            df=df
        )

        # Mean total consumption per current-energy-efficiency value; ``assign`` leaves ``df`` itself untouched
        consumption_averages = (
            df.assign(total_consumption=df["heating_kwh"] + df["hot_water_kwh"])
            .groupby("current-energy-efficiency")["total_consumption"]
            .mean()
            .reset_index()
        )

        self.consumption_averages_filepath = f"energy_consumption/{self.run_date}/consumption_averages.parquet"
        logger.info(f"Storing consumption averages in s3 at {self.consumption_averages_filepath}")
        save_dataframe_to_s3_parquet(
            bucket_name=self.CONSUMPTION_AVERAGES_BUCKET,
            file_key=self.consumption_averages_filepath,
            df=consumption_averages
        )

        self.data = df

    @staticmethod
    def _build_thermal_transmittance_lookup():
        """
        Build the lookup from a thermal transmittance value (0.01 to 2.50 in steps of 0.01, as a string in the
        "from" column) to its range label (the "to" column).
        """
        rows = []
        for hundredths in range(1, 251):
            value = hundredths / 100
            for label, (low, high) in KwhData.THERMAL_TRANSMITTANCE_RANGES.items():
                if low < value <= high:
                    rows.append({"from": value, "to": label})
                    break

        lookup = pd.DataFrame(rows)
        # Stringified the same way as the cleaned values, so the two sides of the merge compare equal
        lookup["from"] = lookup["from"].astype(str)
        return lookup

    def transform(
            self, data: pd.DataFrame, cleaned, new=False, save=False
    ):
        """
        Given the input EPCs, this method will transform the data into a format that can be used by the model.
        This method can be used to transform the training data, or new epcs within the backend engine.

        The input frame is not modified; the transformation is applied to a copy.

        :param data: EPC records, one per row.
        :param cleaned: Mapping from each of walls/roof/floor-description to records carrying
            "original_description" and "thermal_transmittance".
        :param new: Currently unused. TODO: temporary parameter, intended to switch the epc descriptions to their
            transformed features in anticipation of the new model.
        :param save: When True the result is written to s3 as the model training data and nothing is returned.
        :raises Exception: If ``save`` is requested but no bucket is set.
        :return: The transformed frame; ``data`` itself when it is empty; None when ``save`` is True.
        """
        if save and self.bucket is None:
            raise Exception("bucket not set, cannot save data")

        if data.empty:
            # If we have no data
            return data

        # Work on a copy so the caller's frame is not partially mutated
        data = data.copy()

        data["lodgement-date"] = pd.to_datetime(data["lodgement-date"])
        data["lodgement-year"] = data["lodgement-date"].dt.year
        data["lodgement-month"] = data["lodgement-date"].dt.month

        # For walls, roof, floor description where we have average thermal transmittance, we group the values
        # into ranges to avoid too many categories
        thermal_transmittance_lookup_table = self._build_thermal_transmittance_lookup()

        for feature in self.THERMAL_TRANSMITTANCE_FEATURES:
            cleaned_df = pd.DataFrame(cleaned[feature])[["original_description", "thermal_transmittance"]]
            # Round to 2 decimal places and convert to string
            cleaned_df["thermal_transmittance"] = cleaned_df["thermal_transmittance"].round(2).astype(str)

            # Bring in the thermal transmittance of each description...
            data = data.merge(
                cleaned_df,
                how="left",
                left_on=feature,
                right_on="original_description",
            )
            # ...and from it the label of the range it falls in
            data = data.merge(
                thermal_transmittance_lookup_table,
                how="left",
                left_on="thermal_transmittance",
                right_on="from",
            )
            # Where a range label was found it replaces the description, otherwise the description is kept
            data[feature] = np.where(
                ~pd.isnull(data["to"]),
                data["to"],
                data[feature]
            )
            data = data.drop(columns=["original_description", "thermal_transmittance", "from", "to"])

        data[self.NUMERICAL_COLUMNS] = data[self.NUMERICAL_COLUMNS].apply(pd.to_numeric)
        data[self.CATEGORICAL_COLUMNS] = data[self.CATEGORICAL_COLUMNS].astype(str)

        # Create new features:
        data['estimate_annual_kwh'] = data['energy-consumption-current'] * data['total-floor-area']

        # Ensure this is string, because we could have mixed types
        data["lodgement-datetime"] = data["lodgement-datetime"].astype(str)

        if save:
            self.model_training_data_filepath = f"energy_consumption/{self.run_date}/training_data.parquet"
            logger.info(f"Storing model training data in s3 at {self.model_training_data_filepath}")
            save_dataframe_to_s3_parquet(
                bucket_name=self.bucket,
                file_key=self.model_training_data_filepath,
                df=data
            )
            return

        return data

    @staticmethod
    def _prepare_epc(p: "Property"):
        """
        Given an instance of the property class, this method will ensure that the EPC is ready for scoring with the
        kwh models. In the backend, we perform some cleaning and transformation on an EPC so we just ensure that the
        data is in the format required by the model.

        :param p: The property whose prepared EPC record is exported.
        :return: The EPC as a dict with kebab-case keys: numeric fields as floats (None is left as is), flag
            fields as "Y"/"N" and the no-data placeholders filled in.
        """
        epc = p.epc_record.to_dict(case="kebab", source="prepared")

        for field in KwhData._EPC_NUMERIC_FIELDS:
            if epc[field] is not None:
                epc[field] = float(epc[field])

        for field in KwhData._EPC_FLAG_FIELDS:
            epc[field] = KwhData._EPC_FLAG_MAP[epc[field]]

        for field, fill_val in KwhData._EPC_NO_DATA_FILLS.items():
            if pd.isnull(epc[field]):
                epc[field] = fill_val

        return epc

    def prepare_epc(self, input_properties: list["Property"]):
        """
        Build the scoring frame for a list of properties: one prepared EPC per row, plus the lodgement year and
        month and an "id" column holding a copy of the uprn.

        :param input_properties: The properties to score.
        :return: The scoring frame; an empty frame when no properties are given.
        """
        scoring_data = pd.DataFrame([self._prepare_epc(p) for p in input_properties])
        if scoring_data.empty:
            # Nothing to score, and none of the columns used below exist
            return scoring_data

        lodgement_dates = pd.to_datetime(scoring_data["lodgement-date"])
        scoring_data["lodgement-year"] = lodgement_dates.dt.year
        scoring_data["lodgement-month"] = lodgement_dates.dt.month

        scoring_data["id"] = scoring_data["uprn"].copy()

        return scoring_data

    def convert_cost_to_today(self, original_cost, lodgement_date):
        """
        Given energy costs in an EPC, this function converts that energy cost to a figure based on today's energy
        costs (or as close to today as possible).

        The cost is scaled by the ratio between the latest tariff and the tariff on the date closest to the
        lodgement date.

        :param original_cost: The original energy cost
        :param lodgement_date: The date the EPC was lodged (a timestamp, or anything ``pd.Timestamp`` accepts)
        :raises ValueError: If the retail price comparison is not loaded or holds no usable date.
        :return: The energy cost at the latest tariff
        """
        prices = self.retail_price_comparison
        if prices is None:
            raise ValueError("Retail price comparison not loaded, call read_retail_price_comparison first")

        # Distance of every price date from the lodgement date; positional index so ``iloc`` can be used below
        gaps = (prices['Date'] - pd.Timestamp(lodgement_date)).abs().reset_index(drop=True)
        if gaps.isna().all():
            raise ValueError("No usable date found in the retail price comparison")

        # idxmin ignores unparseable (NaT) dates
        closest_position = gaps.idxmin()
        tariff_at_lodgement = prices[self.TARIFF_COLUMN].iloc[closest_position]

        # The latest available tariff price
        # NOTE(review): assumes the rows are in chronological order, so the last row is the latest - confirm.
        latest_tariff = prices[self.TARIFF_COLUMN].iloc[-1]

        ratio = float(latest_tariff) / float(tariff_at_lodgement)

        return original_cost * ratio
|