Add new kWh ETL process to the backend

This commit is contained in:
Khalim Conn-Kowlessar 2024-08-09 11:14:48 +01:00
parent 73be979c29
commit fffb179219

View file

@ -43,6 +43,7 @@ from utils.s3 import read_dataframe_from_s3_parquet, read_csv_from_s3
from backend.ml_models.Valuation import PropertyValuation
from etl.bill_savings.EnergyConsumptionModel import EnergyConsumptionModel
from etl.bill_savings.KwhData import KwhData
from etl.spatial.OpenUprnClient import OpenUprnClient
from etl.solar.SolarPhotoSupply import SolarPhotoSupply
@ -432,123 +433,10 @@ async def trigger_plan(body: PlanTriggerRequest):
model_api = ModelApi(portfolio_id=body.portfolio_id, timestamp=created_at)
epcs_for_scoring = energy_consumption_client.prepare_new_data(input_properties)
# prepare the data
# TODO - this needs to be moved to the etl process
import numpy as np
def add_features_from_code(df):
    """Perform feature engineering on a frame of EPC records.

    Steps, in order:

    1. Derive ``lodgement-year`` and ``lodgement-month`` from
       ``lodgement-date``.
    2. For the walls/roof/floor description columns, look each description
       up in ``cleaned[feature]``; where a thermal transmittance is found
       and (rounded to 2 d.p.) falls in ``(0, 2.5]``, replace the
       description with a coarse band label so the column does not carry
       too many categories. Other descriptions are left unchanged.
    3. Convert the numerical feature columns with ``pd.to_numeric`` and
       cast the categorical columns to ``str``.

    ``cleaned`` is a free variable resolved outside this function; each
    ``cleaned[feature]`` must be convertible to a DataFrame exposing
    ``original_description`` and ``thermal_transmittance`` columns.

    Args:
        df: DataFrame holding ``lodgement-date`` plus every column named in
            the feature lists below.

    Returns:
        A new DataFrame with the engineered features. The input frame is
        not modified.

    Raises:
        KeyError: if a required column is missing from ``df`` or from a
            ``cleaned`` lookup.
    """
    FEATURES = {
        "heating_kwh": [
            "lodgement-year", "lodgement-month", "current-energy-efficiency",
            "energy-consumption-current", "heating-cost-current",
            "heating-cost-potential", "total-floor-area", "number-heated-rooms",
            "mainheat-description", "mainheat-energy-eff", "main-fuel",
            "secondheat-description", "property-type", "built-form",
            "mainheatcont-description", "hotwater-description",
            "hot-water-energy-eff", "walls-energy-eff", "roof-energy-eff",
            "windows-description", "windows-energy-eff", "floor-description",
            "flat-top-storey", "flat-storey-count", "unheated-corridor-length",
            "solar-water-heating-flag", "mechanical-ventilation",
            "low-energy-lighting", "environment-impact-current", "energy-tariff",
            "county", "construction-age-band", "co2-emissions-current",
        ],
        "hot_water_kwh": [
            "lodgement-year", "lodgement-month",
            "current-energy-efficiency",
            "energy-consumption-current",
            "hot-water-cost-current",
            "total-floor-area", "number-heated-rooms",
            "hotwater-description", "hot-water-energy-eff", "main-fuel",
            "property-type", "built-form",
            "co2-emissions-current",
        ],
    }
    CATEGORICAL_COLUMNS = [
        "lodgement-year", "lodgement-month", "main-fuel", "mainheat-description",
        "number-heated-rooms", "number-habitable-rooms", "mainheat-energy-eff",
        "mainheatcont-description", "property-type", "built-form",
        "construction-age-band", "secondheat-description", "hotwater-description",
        "hot-water-energy-eff", "walls-description", "walls-energy-eff",
        "roof-description", "roof-energy-eff", "floor-description", "county",
        "windows-description", "windows-energy-eff", "flat-top-storey",
        "flat-storey-count", "unheated-corridor-length", "solar-water-heating-flag",
        "mechanical-ventilation", "low-energy-lighting",
        "environment-impact-current", "energy-tariff", "current-energy-rating",
    ]
    # Every model feature that is not categorical; dict.fromkeys de-duplicates
    # while keeping a deterministic order.
    NUMERICAL_COLUMNS = list(dict.fromkeys(
        column
        for column in FEATURES["heating_kwh"] + FEATURES["hot_water_kwh"]
        if column not in CATEGORICAL_COLUMNS
    ))

    # Work on a copy so the caller's frame is not modified as a side effect.
    df = df.copy()

    df["lodgement-date"] = pd.to_datetime(df["lodgement-date"])
    df["lodgement-year"] = df["lodgement-date"].dt.year
    df["lodgement-month"] = df["lodgement-date"].dt.month

    # For walls, roof and floor descriptions where we have an average thermal
    # transmittance, group the values into bands to avoid too many categories.
    ranges = {
        "lessthan 0.1": (0, 0.1),
        "0.1 - 0.3": (0.1, 0.3),
        "0.3 - 0.5": (0.3, 0.5),
        "morethan 0.5": (0.5, 2.5),
    }
    # Lookup table: one row per 0.01 step in (0, 2.5], mapped to its band.
    # The bands are half-open (low, high], so each value matches at most one.
    thermal_transmittance_lookup_table = pd.DataFrame([
        {"from": hundredths / 100, "to": label}
        for hundredths in range(1, 251)
        for label, (low, high) in ranges.items()
        if low < hundredths / 100 <= high
    ])
    # The join key is the string form of the value; the other side of the
    # join is stringified the same way below.
    thermal_transmittance_lookup_table["from"] = (
        thermal_transmittance_lookup_table["from"].astype(str)
    )

    for feature in ["walls-description", "roof-description", "floor-description"]:
        # One lookup row per description: a repeated description would make the
        # left merge below emit the same property row several times. The first
        # entry wins.
        cleaned_df = pd.DataFrame(cleaned[feature])[
            ["original_description", "thermal_transmittance"]
        ].drop_duplicates(subset="original_description")
        # Round to 2 decimal places and convert to string
        cleaned_df["thermal_transmittance"] = (
            cleaned_df["thermal_transmittance"].round(2).astype(str)
        )
        df = df.merge(
            cleaned_df,
            how="left",
            left_on=feature,
            right_on="original_description",
        )
        # The thermal transmittance is now in the data, so it can be grouped
        # with the lookup table.
        df = df.merge(
            thermal_transmittance_lookup_table,
            how="left",
            left_on="thermal_transmittance",
            right_on="from",
        )
        # Where "to" is populated, replace the feature with the band label.
        df[feature] = np.where(
            ~pd.isnull(df["to"]),
            df["to"],
            df[feature],
        )
        df = df.drop(columns=["original_description", "thermal_transmittance", "from", "to"])

    # Convert data types
    df[NUMERICAL_COLUMNS] = df[NUMERICAL_COLUMNS].apply(pd.to_numeric)
    df[CATEGORICAL_COLUMNS] = df[CATEGORICAL_COLUMNS].astype(str)
    return df
def add_estimate_annual_kwh(df):
    """Add an ``estimate_annual_kwh`` column to ``df`` and return ``df``.

    The new column is the element-wise product of the
    ``energy-consumption-current`` and ``total-floor-area`` columns.
    The frame is modified in place; the same object is returned.
    """
    consumption = df['energy-consumption-current']
    floor_area = df['total-floor-area']
    df['estimate_annual_kwh'] = consumption * floor_area
    return df
epcs_for_scoring = add_features_from_code(epcs_for_scoring)
epcs_for_scoring = add_estimate_annual_kwh(epcs_for_scoring)
epcs_for_scoring = KwhData.transform(
data=pd.DataFrame([p.epc_record.original_epc for p in input_properties]),
cleaned=cleaned,
)
kwh_predictions = model_api.predict_all(
df=epcs_for_scoring,