Add new kWh ETL process to backend

2026-08-02 21:08:24 +00:00 · 2024-08-09 11:14:48 +01:00 · 2024-08-09 11:14:48 +01:00 · fffb179219
commit fffb179219
parent 73be979c29
1 changed files with 5 additions and 117 deletions
--- a/backend/app/plan/router.py
+++ b/backend/app/plan/router.py
@@ -43,6 +43,7 @@ from utils.s3 import read_dataframe_from_s3_parquet, read_csv_from_s3
 from backend.ml_models.Valuation import PropertyValuation

 from etl.bill_savings.EnergyConsumptionModel import EnergyConsumptionModel
+from etl.bill_savings.KwhData import KwhData
 from etl.spatial.OpenUprnClient import OpenUprnClient
 from etl.solar.SolarPhotoSupply import SolarPhotoSupply

@@ -432,123 +433,10 @@ async def trigger_plan(body: PlanTriggerRequest):

        model_api = ModelApi(portfolio_id=body.portfolio_id, timestamp=created_at)

-        epcs_for_scoring = energy_consumption_client.prepare_new_data(input_properties)
-
-        # prepare the data
-
-        # TODO - this needs to be moved to the etl process
-        import numpy as np
-        def add_features_from_code(df):
-
-            FEATURES = {
-                "heating_kwh": [
-                    "lodgement-year", "lodgement-month", "current-energy-efficiency", "energy-consumption-current",
-                    "heating-cost-current", "heating-cost-potential", "total-floor-area", "number-heated-rooms",
-                    "mainheat-description", "mainheat-energy-eff", "main-fuel", "secondheat-description",
-                    "property-type",
-                    "built-form", "mainheatcont-description", "hotwater-description", "hot-water-energy-eff",
-                    "walls-energy-eff",
-                    "roof-energy-eff", "windows-description", "windows-energy-eff", "floor-description",
-                    "flat-top-storey",
-                    "flat-storey-count", "unheated-corridor-length", "solar-water-heating-flag",
-                    "mechanical-ventilation",
-                    "low-energy-lighting", "environment-impact-current", "energy-tariff",
-                    "county", "construction-age-band", "co2-emissions-current",
-                ],
-                "hot_water_kwh": [
-                    "lodgement-year", "lodgement-month",
-                    "current-energy-efficiency",
-                    "energy-consumption-current",
-                    "hot-water-cost-current",
-                    "total-floor-area", "number-heated-rooms",
-                    "hotwater-description", "hot-water-energy-eff", "main-fuel", "property-type", "built-form",
-                    "co2-emissions-current",
-                ]
-            }
-            CATEGORICAL_COLUMNS = [
-                "lodgement-year", "lodgement-month", "main-fuel", "mainheat-description", "number-heated-rooms",
-                "number-habitable-rooms", "mainheat-energy-eff", "mainheatcont-description", "property-type",
-                "built-form",
-                "construction-age-band", "secondheat-description", "hotwater-description", "hot-water-energy-eff",
-                "walls-description", "walls-energy-eff", "roof-description", "roof-energy-eff", "floor-description",
-                "county",
-                "windows-description", "windows-energy-eff", "flat-top-storey",
-                "flat-storey-count", "unheated-corridor-length", "solar-water-heating-flag", "mechanical-ventilation",
-                "low-energy-lighting", "environment-impact-current", "energy-tariff", "current-energy-rating"
-            ]
-
-            NUMERICAL_COLUMNS = list({
-                x for x in FEATURES["heating_kwh"] + FEATURES["hot_water_kwh"]
-                if x not in CATEGORICAL_COLUMNS
-            })
-
-            """Performs feature engineering on the dataset."""
-            df["lodgement-date"] = pd.to_datetime(df["lodgement-date"])
-            df["lodgement-year"] = df["lodgement-date"].dt.year
-            df["lodgement-month"] = df["lodgement-date"].dt.month
-
-            # For walls, roof, floor description where we have average thermal transmittance, to avoid too many
-            # categories
-            # we group them
-            ranges = {
-                "lessthan 0.1": (0, 0.1),
-                "0.1 - 0.3": (0.1, 0.3),
-                "0.3 - 0.5": (0.3, 0.5),
-                "morethan 0.5": (0.5, 2.5),
-            }
-
-            # Generate the lookup table
-            thermal_transmittance_lookup_table = []
-            for i in range(1, 251):
-                value = i / 100
-                for label, (low, high) in ranges.items():
-                    if low < value <= high:
-                        thermal_transmittance_lookup_table.append({"from": value, "to": label})
-                        break
-
-            # Convert to DataFrame for display
-            thermal_transmittance_lookup_table = pd.DataFrame(thermal_transmittance_lookup_table)
-            thermal_transmittance_lookup_table["from"] = thermal_transmittance_lookup_table["from"].astype(str)
-
-            # Apply the lookup table to the data
-            for feature in ["walls-description", "roof-description", "floor-description"]:
-                cleaned_df = pd.DataFrame(cleaned[feature])[["original_description", "thermal_transmittance"]]
-                # Round to 2 decimal places and convert to string
-                cleaned_df["thermal_transmittance"] = cleaned_df["thermal_transmittance"].round(2).astype(str)
-
-                df = df.merge(
-                    cleaned_df,
-                    how="left",
-                    left_on=feature,
-                    right_on="original_description",
-                )
-                # We now have the thermal transmittance in the data, which we can use to group with the lookup table
-                df = df.merge(
-                    thermal_transmittance_lookup_table,
-                    how="left",
-                    left_on="thermal_transmittance",
-                    right_on="from",
-                )
-                # Where "to" is populated, replace feature with to
-                df[feature] = np.where(
-                    ~pd.isnull(df["to"]),
-                    df["to"],
-                    df[feature]
-                )
-                df = df.drop(columns=["original_description", "thermal_transmittance", "from", "to"])
-
-            # Convert data types
-            df[NUMERICAL_COLUMNS] = df[NUMERICAL_COLUMNS].apply(pd.to_numeric)
-            df[CATEGORICAL_COLUMNS] = df[CATEGORICAL_COLUMNS].astype(str)
-
-            return df
-
-        def add_estimate_annual_kwh(df):
-            df['estimate_annual_kwh'] = df['energy-consumption-current'] * df['total-floor-area']
-            return df
-
-        epcs_for_scoring = add_features_from_code(epcs_for_scoring)
-        epcs_for_scoring = add_estimate_annual_kwh(epcs_for_scoring)
+        epcs_for_scoring = KwhData.transform(
+            data=pd.DataFrame([p.epc_record.original_epc for p in input_properties]),
+            cleaned=cleaned,
+        )

        kwh_predictions = model_api.predict_all(
            df=epcs_for_scoring,