mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
created transform method
This commit is contained in:
parent
3002a2c740
commit
73be979c29
5 changed files with 113 additions and 3 deletions
|
|
@ -586,7 +586,7 @@ class EnergyConsumptionModel:
|
|||
|
||||
def estimate_new_consumption(self, current_energy_efficiency, target_efficiency, current_consumption):
|
||||
"""
|
||||
Given then consumption_averages dataset, which is produced as a result of the data_combining.py script,
|
||||
Given then consumption_averages dataset, which is produced as a result of the training_data.py script,
|
||||
for the energy kwh models, this function will estimate the new consumption based on the current consumption,
|
||||
based on the expected reduction in consumption from the current rating to the target rating.
|
||||
:param current_energy_efficiency:
|
||||
|
|
|
|||
|
|
@ -1,5 +1,6 @@
|
|||
import re
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from datetime import datetime
|
||||
from tqdm import tqdm
|
||||
from utils.logger import setup_logger
|
||||
|
|
@ -11,6 +12,23 @@ logger = setup_logger()
|
|||
class KwhData:
|
||||
COLS_TO_STRINGIFY = ["main-heating-controls", "floor-level"]
|
||||
|
||||
CATEGORICAL_COLUMNS = [
|
||||
"lodgement-year", "lodgement-month", "main-fuel", "mainheat-description", "number-heated-rooms",
|
||||
"number-habitable-rooms", "mainheat-energy-eff", "mainheatcont-description", "property-type",
|
||||
"built-form",
|
||||
"construction-age-band", "secondheat-description", "hotwater-description", "hot-water-energy-eff",
|
||||
"walls-description", "walls-energy-eff", "roof-description", "roof-energy-eff", "floor-description",
|
||||
"county",
|
||||
"windows-description", "windows-energy-eff", "flat-top-storey",
|
||||
"flat-storey-count", "unheated-corridor-length", "solar-water-heating-flag", "mechanical-ventilation",
|
||||
"low-energy-lighting", "environment-impact-current", "energy-tariff", "current-energy-rating"
|
||||
]
|
||||
|
||||
NUMERICAL_COLUMNS = [
|
||||
'heating-cost-current', 'total-floor-area', 'co2-emissions-current', 'energy-consumption-current',
|
||||
'heating-cost-potential', 'hot-water-cost-current', 'current-energy-efficiency'
|
||||
]
|
||||
|
||||
def __init__(self, bucket):
|
||||
self.run_date = datetime.now().strftime("%Y-%m-%d")
|
||||
self.bucket = bucket
|
||||
|
|
@ -18,6 +36,7 @@ class KwhData:
|
|||
|
||||
self.consumption_data_filepath = None
|
||||
self.consumption_averages_filepath = None
|
||||
self.model_training_data_filepath = None
|
||||
|
||||
@staticmethod
|
||||
def extract_kwh_value(text: str):
|
||||
|
|
@ -116,3 +135,84 @@ class KwhData:
|
|||
)
|
||||
|
||||
self.data = df
|
||||
|
||||
def transform(
|
||||
self, data: pd.DataFrame, cleaned, new=False, save=False
|
||||
):
|
||||
"""
|
||||
Given the input EPCs, this method will transform the data into a format that can be used by the model
|
||||
This method can be used to transform the training data, or new epcs within the backend engine
|
||||
:return:
|
||||
"""
|
||||
|
||||
# TODO: New is a temporary parameter, which will transform the epc descriptions to their transformed features
|
||||
# in anticipation of the new model
|
||||
|
||||
data["lodgement-date"] = pd.to_datetime(data["lodgement-date"])
|
||||
data["lodgement-year"] = data["lodgement-date"].dt.year
|
||||
data["lodgement-month"] = data["lodgement-date"].dt.month
|
||||
|
||||
# For walls, roof, floor description where we have average thermal transmittance, to avoid too many
|
||||
# categories
|
||||
# we group them
|
||||
ranges = {
|
||||
"lessthan 0.1": (0, 0.1),
|
||||
"0.1 - 0.3": (0.1, 0.3),
|
||||
"0.3 - 0.5": (0.3, 0.5),
|
||||
"morethan 0.5": (0.5, 2.5),
|
||||
}
|
||||
|
||||
# Generate the lookup table
|
||||
thermal_transmittance_lookup_table = []
|
||||
for i in range(1, 251):
|
||||
value = i / 100
|
||||
for label, (low, high) in ranges.items():
|
||||
if low < value <= high:
|
||||
thermal_transmittance_lookup_table.append({"from": value, "to": label})
|
||||
break
|
||||
|
||||
# Convert to DataFrame for display
|
||||
thermal_transmittance_lookup_table = pd.DataFrame(thermal_transmittance_lookup_table)
|
||||
thermal_transmittance_lookup_table["from"] = thermal_transmittance_lookup_table["from"].astype(str)
|
||||
|
||||
# Apply the lookup table to the data
|
||||
for feature in ["walls-description", "roof-description", "floor-description"]:
|
||||
cleaned_df = pd.DataFrame(cleaned[feature])[["original_description", "thermal_transmittance"]]
|
||||
# Round to 2 decimal places and convert to string
|
||||
cleaned_df["thermal_transmittance"] = cleaned_df["thermal_transmittance"].round(2).astype(str)
|
||||
|
||||
data = data.merge(
|
||||
cleaned_df,
|
||||
how="left",
|
||||
left_on=feature,
|
||||
right_on="original_description",
|
||||
)
|
||||
# We now have the thermal transmittance in the data, which we can use to group with the lookup table
|
||||
data = data.merge(
|
||||
thermal_transmittance_lookup_table,
|
||||
how="left",
|
||||
left_on="thermal_transmittance",
|
||||
right_on="from",
|
||||
)
|
||||
# Where "to" is populated, replace feature with to
|
||||
data[feature] = np.where(
|
||||
~pd.isnull(data["to"]),
|
||||
data["to"],
|
||||
data[feature]
|
||||
)
|
||||
data = data.drop(columns=["original_description", "thermal_transmittance", "from", "to"])
|
||||
|
||||
data[self.NUMERICAL_COLUMNS] = data[self.NUMERICAL_COLUMNS].apply(pd.to_numeric)
|
||||
data[self.CATEGORICAL_COLUMNS] = data[self.CATEGORICAL_COLUMNS].astype(str)
|
||||
|
||||
# Create new features:
|
||||
data['estimate_annual_kwh'] = data['energy-consumption-current'] * data['total-floor-area']
|
||||
|
||||
if save:
|
||||
self.model_training_data_filepath = f"energy_consumption/{self.run_date}/training_data.parquet"
|
||||
logger.info(f"Storing energy consumption dataset in s3 at {self.consumption_data_filepath}")
|
||||
save_dataframe_to_s3_parquet(
|
||||
bucket_name=self.bucket,
|
||||
file_key=self.model_training_data_filepath,
|
||||
df=data
|
||||
)
|
||||
|
|
|
|||
|
|
@ -7,7 +7,7 @@ import inspect
|
|||
import pandas as pd
|
||||
from tqdm import tqdm
|
||||
from bs4 import BeautifulSoup
|
||||
from etl.epc.settings import EARLIEST_EPC_DATE
|
||||
from training_data.epc.settings import EARLIEST_EPC_DATE
|
||||
from pathlib import Path
|
||||
import numpy as np
|
||||
from utils.s3 import save_pickle_to_s3
|
||||
|
|
|
|||
|
|
@ -1,7 +1,7 @@
|
|||
from pprint import pprint
|
||||
import msgpack
|
||||
from utils.s3 import read_from_s3
|
||||
from etl.bill_savings.EnergyConsumptionModel import EnergyConsumptionModel
|
||||
from training_data.bill_savings.EnergyConsumptionModel import EnergyConsumptionModel
|
||||
|
||||
|
||||
def handler():
|
||||
|
|
|
|||
|
|
@ -1,4 +1,6 @@
|
|||
import msgpack
|
||||
from etl.bill_savings.KwhData import KwhData
|
||||
from utils.s3 import read_from_s3
|
||||
|
||||
|
||||
def app():
|
||||
|
|
@ -8,5 +10,13 @@ def app():
|
|||
:return:
|
||||
"""
|
||||
|
||||
cleaned = read_from_s3(
|
||||
s3_file_name="cleaned_epc_data/cleaned.bson",
|
||||
bucket_name="retrofit-data-dev"
|
||||
)
|
||||
|
||||
cleaned = msgpack.unpackb(cleaned, raw=False)
|
||||
|
||||
kwh_data_client = KwhData(bucket="retrofit-datalake-dev")
|
||||
kwh_data_client.combine()
|
||||
kwh_data_client.transform(data=kwh_data_client.data, cleaned=cleaned, save=True)
|
||||
Loading…
Add table
Reference in a new issue