From fffb1792190c11f2d83a3f65984f4d7154711d9d Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 9 Aug 2024 11:14:48 +0100 Subject: [PATCH] adding new kwh etl process to backend --- backend/app/plan/router.py | 122 ++----------------------------------- 1 file changed, 5 insertions(+), 117 deletions(-) diff --git a/backend/app/plan/router.py b/backend/app/plan/router.py index 9562af86..925bb725 100644 --- a/backend/app/plan/router.py +++ b/backend/app/plan/router.py @@ -43,6 +43,7 @@ from utils.s3 import read_dataframe_from_s3_parquet, read_csv_from_s3 from backend.ml_models.Valuation import PropertyValuation from etl.bill_savings.EnergyConsumptionModel import EnergyConsumptionModel +from etl.bill_savings.KwhData import KwhData from etl.spatial.OpenUprnClient import OpenUprnClient from etl.solar.SolarPhotoSupply import SolarPhotoSupply @@ -432,123 +433,10 @@ async def trigger_plan(body: PlanTriggerRequest): model_api = ModelApi(portfolio_id=body.portfolio_id, timestamp=created_at) - epcs_for_scoring = energy_consumption_client.prepare_new_data(input_properties) - - # prepare the data - - # TODO - this needs to be moved to the etl process - import numpy as np - def add_features_from_code(df): - - FEATURES = { - "heating_kwh": [ - "lodgement-year", "lodgement-month", "current-energy-efficiency", "energy-consumption-current", - "heating-cost-current", "heating-cost-potential", "total-floor-area", "number-heated-rooms", - "mainheat-description", "mainheat-energy-eff", "main-fuel", "secondheat-description", - "property-type", - "built-form", "mainheatcont-description", "hotwater-description", "hot-water-energy-eff", - "walls-energy-eff", - "roof-energy-eff", "windows-description", "windows-energy-eff", "floor-description", - "flat-top-storey", - "flat-storey-count", "unheated-corridor-length", "solar-water-heating-flag", - "mechanical-ventilation", - "low-energy-lighting", "environment-impact-current", "energy-tariff", - "county", "construction-age-band", "co2-emissions-current", - ], - "hot_water_kwh": [ - "lodgement-year", "lodgement-month", - "current-energy-efficiency", - "energy-consumption-current", - "hot-water-cost-current", - "total-floor-area", "number-heated-rooms", - "hotwater-description", "hot-water-energy-eff", "main-fuel", "property-type", "built-form", - "co2-emissions-current", - ] - } - CATEGORICAL_COLUMNS = [ - "lodgement-year", "lodgement-month", "main-fuel", "mainheat-description", "number-heated-rooms", - "number-habitable-rooms", "mainheat-energy-eff", "mainheatcont-description", "property-type", - "built-form", - "construction-age-band", "secondheat-description", "hotwater-description", "hot-water-energy-eff", - "walls-description", "walls-energy-eff", "roof-description", "roof-energy-eff", "floor-description", - "county", - "windows-description", "windows-energy-eff", "flat-top-storey", - "flat-storey-count", "unheated-corridor-length", "solar-water-heating-flag", "mechanical-ventilation", - "low-energy-lighting", "environment-impact-current", "energy-tariff", "current-energy-rating" - ] - - NUMERICAL_COLUMNS = list({ - x for x in FEATURES["heating_kwh"] + FEATURES["hot_water_kwh"] - if x not in CATEGORICAL_COLUMNS - }) - - """Performs feature engineering on the dataset.""" - df["lodgement-date"] = pd.to_datetime(df["lodgement-date"]) - df["lodgement-year"] = df["lodgement-date"].dt.year - df["lodgement-month"] = df["lodgement-date"].dt.month - - # For walls, roof, floor description where we have average thermal transmittance, to avoid too many - # categories - # we group them - ranges = { - "lessthan 0.1": (0, 0.1), - "0.1 - 0.3": (0.1, 0.3), - "0.3 - 0.5": (0.3, 0.5), - "morethan 0.5": (0.5, 2.5), - } - - # Generate the lookup table - thermal_transmittance_lookup_table = [] - for i in range(1, 251): - value = i / 100 - for label, (low, high) in ranges.items(): - if low < value <= high: - thermal_transmittance_lookup_table.append({"from": value, "to": label}) - break - - # Convert to DataFrame for display - thermal_transmittance_lookup_table = pd.DataFrame(thermal_transmittance_lookup_table) - thermal_transmittance_lookup_table["from"] = thermal_transmittance_lookup_table["from"].astype(str) - - # Apply the lookup table to the data - for feature in ["walls-description", "roof-description", "floor-description"]: - cleaned_df = pd.DataFrame(cleaned[feature])[["original_description", "thermal_transmittance"]] - # Round to 2 decimal places and convert to string - cleaned_df["thermal_transmittance"] = cleaned_df["thermal_transmittance"].round(2).astype(str) - - df = df.merge( - cleaned_df, - how="left", - left_on=feature, - right_on="original_description", - ) - # We now have the thermal transmittance in the data, which we can use to group with the lookup table - df = df.merge( - thermal_transmittance_lookup_table, - how="left", - left_on="thermal_transmittance", - right_on="from", - ) - # Where "to" is populated, replace feature with to - df[feature] = np.where( - ~pd.isnull(df["to"]), - df["to"], - df[feature] - ) - df = df.drop(columns=["original_description", "thermal_transmittance", "from", "to"]) - - # Convert data types - df[NUMERICAL_COLUMNS] = df[NUMERICAL_COLUMNS].apply(pd.to_numeric) - df[CATEGORICAL_COLUMNS] = df[CATEGORICAL_COLUMNS].astype(str) - - return df - - def add_estimate_annual_kwh(df): - df['estimate_annual_kwh'] = df['energy-consumption-current'] * df['total-floor-area'] - return df - - epcs_for_scoring = add_features_from_code(epcs_for_scoring) - epcs_for_scoring = add_estimate_annual_kwh(epcs_for_scoring) + epcs_for_scoring = KwhData.transform( + data=pd.DataFrame([p.epc_record.original_epc for p in input_properties]), + cleaned=cleaned, + ) kwh_predictions = model_api.predict_all( df=epcs_for_scoring,