created transform method

This commit is contained in:
Khalim Conn-Kowlessar 2024-08-09 11:07:16 +01:00
parent 3002a2c740
commit 73be979c29
5 changed files with 113 additions and 3 deletions

View file

@ -586,7 +586,7 @@ class EnergyConsumptionModel:
def estimate_new_consumption(self, current_energy_efficiency, target_efficiency, current_consumption):
"""
Given then consumption_averages dataset, which is produced as a result of the data_combining.py script,
Given then consumption_averages dataset, which is produced as a result of the training_data.py script,
for the energy kwh models, this function will estimate the new consumption based on the current consumption,
based on the expected reduction in consumption from the current rating to the target rating.
:param current_energy_efficiency:

View file

@ -1,5 +1,6 @@
import re
import pandas as pd
import numpy as np
from datetime import datetime
from tqdm import tqdm
from utils.logger import setup_logger
@ -11,6 +12,23 @@ logger = setup_logger()
class KwhData:
COLS_TO_STRINGIFY = ["main-heating-controls", "floor-level"]
CATEGORICAL_COLUMNS = [
"lodgement-year", "lodgement-month", "main-fuel", "mainheat-description", "number-heated-rooms",
"number-habitable-rooms", "mainheat-energy-eff", "mainheatcont-description", "property-type",
"built-form",
"construction-age-band", "secondheat-description", "hotwater-description", "hot-water-energy-eff",
"walls-description", "walls-energy-eff", "roof-description", "roof-energy-eff", "floor-description",
"county",
"windows-description", "windows-energy-eff", "flat-top-storey",
"flat-storey-count", "unheated-corridor-length", "solar-water-heating-flag", "mechanical-ventilation",
"low-energy-lighting", "environment-impact-current", "energy-tariff", "current-energy-rating"
]
NUMERICAL_COLUMNS = [
'heating-cost-current', 'total-floor-area', 'co2-emissions-current', 'energy-consumption-current',
'heating-cost-potential', 'hot-water-cost-current', 'current-energy-efficiency'
]
def __init__(self, bucket):
self.run_date = datetime.now().strftime("%Y-%m-%d")
self.bucket = bucket
@ -18,6 +36,7 @@ class KwhData:
self.consumption_data_filepath = None
self.consumption_averages_filepath = None
self.model_training_data_filepath = None
@staticmethod
def extract_kwh_value(text: str):
@ -116,3 +135,84 @@ class KwhData:
)
self.data = df
def transform(
self, data: pd.DataFrame, cleaned, new=False, save=False
):
"""
Given the input EPCs, this method will transform the data into a format that can be used by the model
This method can be used to transform the training data, or new epcs within the backend engine
:return:
"""
# TODO: New is a temporary parameter, which will transform the epc descriptions to their transformed features
# in anticipation of the new model
data["lodgement-date"] = pd.to_datetime(data["lodgement-date"])
data["lodgement-year"] = data["lodgement-date"].dt.year
data["lodgement-month"] = data["lodgement-date"].dt.month
# For walls, roof, floor description where we have average thermal transmittance, to avoid too many
# categories
# we group them
ranges = {
"lessthan 0.1": (0, 0.1),
"0.1 - 0.3": (0.1, 0.3),
"0.3 - 0.5": (0.3, 0.5),
"morethan 0.5": (0.5, 2.5),
}
# Generate the lookup table
thermal_transmittance_lookup_table = []
for i in range(1, 251):
value = i / 100
for label, (low, high) in ranges.items():
if low < value <= high:
thermal_transmittance_lookup_table.append({"from": value, "to": label})
break
# Convert to DataFrame for display
thermal_transmittance_lookup_table = pd.DataFrame(thermal_transmittance_lookup_table)
thermal_transmittance_lookup_table["from"] = thermal_transmittance_lookup_table["from"].astype(str)
# Apply the lookup table to the data
for feature in ["walls-description", "roof-description", "floor-description"]:
cleaned_df = pd.DataFrame(cleaned[feature])[["original_description", "thermal_transmittance"]]
# Round to 2 decimal places and convert to string
cleaned_df["thermal_transmittance"] = cleaned_df["thermal_transmittance"].round(2).astype(str)
data = data.merge(
cleaned_df,
how="left",
left_on=feature,
right_on="original_description",
)
# We now have the thermal transmittance in the data, which we can use to group with the lookup table
data = data.merge(
thermal_transmittance_lookup_table,
how="left",
left_on="thermal_transmittance",
right_on="from",
)
# Where "to" is populated, replace feature with to
data[feature] = np.where(
~pd.isnull(data["to"]),
data["to"],
data[feature]
)
data = data.drop(columns=["original_description", "thermal_transmittance", "from", "to"])
data[self.NUMERICAL_COLUMNS] = data[self.NUMERICAL_COLUMNS].apply(pd.to_numeric)
data[self.CATEGORICAL_COLUMNS] = data[self.CATEGORICAL_COLUMNS].astype(str)
# Create new features:
data['estimate_annual_kwh'] = data['energy-consumption-current'] * data['total-floor-area']
if save:
self.model_training_data_filepath = f"energy_consumption/{self.run_date}/training_data.parquet"
logger.info(f"Storing energy consumption dataset in s3 at {self.consumption_data_filepath}")
save_dataframe_to_s3_parquet(
bucket_name=self.bucket,
file_key=self.model_training_data_filepath,
df=data
)

View file

@ -7,7 +7,7 @@ import inspect
import pandas as pd
from tqdm import tqdm
from bs4 import BeautifulSoup
from etl.epc.settings import EARLIEST_EPC_DATE
from training_data.epc.settings import EARLIEST_EPC_DATE
from pathlib import Path
import numpy as np
from utils.s3 import save_pickle_to_s3

View file

@ -1,7 +1,7 @@
from pprint import pprint
import msgpack
from utils.s3 import read_from_s3
from etl.bill_savings.EnergyConsumptionModel import EnergyConsumptionModel
from training_data.bill_savings.EnergyConsumptionModel import EnergyConsumptionModel
def handler():

View file

@ -1,4 +1,6 @@
import msgpack
from etl.bill_savings.KwhData import KwhData
from utils.s3 import read_from_s3
def app():
@ -8,5 +10,13 @@ def app():
:return:
"""
cleaned = read_from_s3(
s3_file_name="cleaned_epc_data/cleaned.bson",
bucket_name="retrofit-data-dev"
)
cleaned = msgpack.unpackb(cleaned, raw=False)
kwh_data_client = KwhData(bucket="retrofit-datalake-dev")
kwh_data_client.combine()
kwh_data_client.transform(data=kwh_data_client.data, cleaned=cleaned, save=True)