integrating rebaselining

2026-07-27 23:35:01 +00:00 · 2026-03-24 22:56:53 +00:00 · 2026-03-24 22:56:53 +00:00 · 28b39407d0
commit 28b39407d0
parent b22c7ac6e8
1 changed files with 117 additions and 46 deletions
--- a/backend/engine/engine.py
+++ b/backend/engine/engine.py
@ -2,18 +2,17 @@ import time
 import json
 from copy import deepcopy
 from datetime import datetime
-from typing import Dict
-
-from tqdm import tqdm
 import pandas as pd
 import numpy as np
 from uuid import UUID

+from tqdm import tqdm
+from sqlalchemy.exc import IntegrityError, OperationalError
+from starlette.responses import Response
+
 from backend.SearchEpc import SearchEpc

 from etl.epc.Record import EPCRecord
-from sqlalchemy.exc import IntegrityError, OperationalError
-from starlette.responses import Response
 from backend.app.BatterySapScorer import BatterySAPScorer

 from backend.app.config import get_settings, get_prediction_buckets
@ -122,14 +121,25 @@ def extract_portfolio_aggregation_data(
        cost = sum([r["total"] for r in default_recommendations])
        sap_point_improvement = sum([r["sap_points"] for r in default_recommendations])

-        if not pd.isnull(property_value_increase_ranges[p.id]["current_value"]):
+        # Fix ambiguous Series/DataFrame truth value for current_value
+        current_value = property_value_increase_ranges[p.id]["current_value"]
+        if isinstance(current_value, (pd.Series, pd.DataFrame)):
+            # Reduce to scalar
+            is_null = bool(
+                current_value.isnull().all().item() if
+                hasattr(current_value.isnull().all(), 'item') else current_value.isnull().all().all()
+            )
+        else:
+            is_null = bool(pd.isnull(current_value))
+
+        if (not is_null) and (current_value is not None):
            lower_bound_valuation_uplift = (
                property_value_increase_ranges[p.id]["lower_bound_increased_value"] -
-                property_value_increase_ranges[p.id]["current_value"]
+                current_value
            )
            upper_bound_valuation_uplift = (
                property_value_increase_ranges[p.id]["upper_bound_increased_value"] -
-                property_value_increase_ranges[p.id]["current_value"]
+                current_value
            )
        else:
            lower_bound_valuation_uplift, upper_bound_valuation_uplift = 0, 0
@ -419,11 +429,14 @@ def extract_address_data(config, body):
    Simple helper to grab address data from the config
    :return:
    """
-    uprn = config.get("uprn", None)
-    if pd.isnull(uprn):
+    try:
+        uprn = config.get("uprn", None)
+        if uprn is not None and pd.notnull(uprn):
+            uprn = int(float(uprn))
+        else:
+            uprn = None
+    except Exception:
        uprn = None
-    if uprn:
-        uprn = int(float(uprn))

    address1 = config.get("address", None)
    # Handle domna address list format
@ -706,8 +719,8 @@ async def model_engine(body: PlanTriggerRequest):
            # Otherwise, we use the newest EPC
            # energy_assessment_is_newer will tell us if the energy assessment is newer than the newest EPC that
            # has been publically lodged
-            epc_records, energy_assessment["energy_assessment_is_newer"] = create_epc_records(
-                epc_searcher, energy_assessment
+            epc_records, energy_assessment_is_newer = create_epc_records(
+                epc_searcher, energy_assessment if energy_assessment is not None else {"epc": None}
            )

            req_data = extract_property_request_data(
@ -806,6 +819,7 @@ async def model_engine(body: PlanTriggerRequest):

        # Rebaselining
        # TODO: MUST happen before setting features
+        logger.info("Preparing rebaselining")
        rebaselining_scoring_data = []
        for p in tqdm(input_properties):
            # 1) EPC expired 2) Missing EPC 3) Different information from landlord vs EPC
@ -817,6 +831,8 @@ async def model_engine(body: PlanTriggerRequest):
                rebaselining_scoring_data.append(scoring_data)

        rebaselining_scoring_data = pd.concat(rebaselining_scoring_data)
+        if not rebaselining_scoring_data.empty:
+            logger.info(f"{rebaselining_scoring_data.shape[0]} properties require re-baselineing")

        # Trigger re-scoring
        rebaselining_scoring_data["is_post_sap10_starting"] = True
@ -829,46 +845,100 @@ async def model_engine(body: PlanTriggerRequest):
            extract_uprn=True
        )

-        # TODO - Pull out predictions!!!
-
-        # TODO: TEMP: Compare values
+        # TODO: TEMP: Compare values - and summarise the differences
        compare_scores = []
+
        for x in rebaselining_scoring_data["uprn"].unique():
            record = [p for p in input_properties if p.uprn == x][0].epc_record
+
            original_sap = record.current_energy_efficiency
-            new_sap = rebaselining_response["retrofit-sap-baseline-predictions"][
-                rebaselining_response["retrofit-sap-baseline-predictions"]["uprn"] == x
+            new_sap = rebaselining_response["retrofit_sap_baseline_predictions"][
+                rebaselining_response["retrofit_sap_baseline_predictions"]["uprn"] == x
                ]["predictions"].values[0]
+
            lodgement_date = record.lodgement_date
-            compare_scores.append({
+            ll_differences = record.landlord_differences
+
+            # 🔑 Normalise original keys to match LL format
+            original = {
+                k.replace("-", "_"): v
+                for k, v in record.original_epc.items()
+                if k.replace("-", "_") in ll_differences
+            }
+
+            row = {
                "uprn": x,
                "original_sap": original_sap,
                "new_sap": new_sap,
-                "lodgement_date": lodgement_date
-            })
-        compare_scores = pd.DataFrame(compare_scores)
-
-        for uprn in rebaselining_scoring_data["uprn"].unique():
-            # Get the predictions
-            models = [
-                "retrofit-sap-baseline-predictions",
-                "retrofit-carbon-baseline-predictions",
-                "retrofit-heat-baseline-predictions",
-            ]
-            property_prediction: Dict[str, float] = {
-                model: rebaselining_response[model][
-                    rebaselining_response[model]["uprn"] == uprn
-                    ]["predictions"].values[0] for model in models
+                "differences": ll_differences,
+                "lodgement_date": lodgement_date,
            }

-            # We now need to insert the new values into the epc_record
-            property_instance = next(p for p in input_properties if p.uprn == int(uprn))
-            property_instance.epc_record.insert_new_performance_values(
-                new_sap=property_prediction["retrofit-sap-baseline-predictions"],
-                new_epc=sap_to_epc(property_prediction["retrofit-sap-baseline-predictions"]),
-                new_carbon=property_prediction["retrofit-carbon-baseline-predictions"],
-                new_heat_demand=property_prediction["retrofit-heat-baseline-predictions"],
-            )
+            # 🔑 Add paired columns in order
+            for key in ll_differences.keys():
+                row[f"{key}_ori"] = original.get(key)
+                row[f"{key}_ll"] = ll_differences.get(key)
+
+            compare_scores.append(row)
+
+        compare_scores = pd.DataFrame(compare_scores)
+        df = compare_scores.copy()
+
+        ori_cols = [c for c in df.columns if c.endswith("_ori")]
+
+        for ori_col in ori_cols:
+            ll_col = ori_col.replace("_ori", "_ll")
+
+            if ll_col in df.columns:
+                # Handle NaNs properly
+                same = (
+                    df[ori_col].fillna("NULL")
+                    == df[ll_col].fillna("NULL")
+                )
+
+                df.loc[same, [ori_col, ll_col]] = None
+
+        # --- Refactored: Efficiently update EPC records with new model predictions ---
+        # Pre-index input_properties by UPRN for fast lookup
+        input_properties_by_uprn = {int(p.uprn): p for p in input_properties if p.uprn is not None}
+
+        # Pre-index predictions for each model by UPRN
+        model_names = [
+            "retrofit_sap_baseline_predictions",
+            "retrofit_carbon_baseline_predictions",
+            "retrofit_heat_baseline_predictions",
+        ]
+        predictions_by_model_and_uprn = {}
+        for model in model_names:
+            df = rebaselining_response[model]
+            predictions_by_model_and_uprn[model] = dict(zip(df["uprn"].astype(int), df["predictions"]))
+
+        for uprn in rebaselining_scoring_data["uprn"].unique():
+            try:
+                uprn_int = int(uprn)
+                property_instance = input_properties_by_uprn.get(uprn_int)
+                if property_instance is None:
+                    logger.warning(f"No property found for UPRN {uprn_int} during rebaselining update.")
+                    continue
+                # Gather predictions for this UPRN
+                try:
+                    new_sap = predictions_by_model_and_uprn["retrofit_sap_baseline_predictions"][uprn_int]
+                    new_carbon = predictions_by_model_and_uprn["retrofit_carbon_baseline_predictions"][uprn_int]
+                    new_heat_demand = predictions_by_model_and_uprn["retrofit_heat_baseline_predictions"][uprn_int]
+                except KeyError as e:
+                    logger.warning(f"Missing prediction for UPRN {uprn_int}: {e}")
+                    continue
+                # Update EPC record
+                property_instance.epc_record.insert_new_performance_values(
+                    new_sap=new_sap,
+                    new_epc=sap_to_epc(new_sap),
+                    new_carbon=new_carbon,
+                    new_heat_demand=new_heat_demand,
+                )
+                logger.info(f"Updated EPC record for UPRN {uprn_int} with new model predictions.")
+            except Exception as e:
+                logger.error(f"Error updating EPC record for UPRN {uprn}: {e}")
+        # --- End refactor ---

        kwh_client = KwhData(bucket=get_settings().DATA_BUCKET, read_consumption_data=True)

@ -1135,8 +1205,9 @@ async def model_engine(body: PlanTriggerRequest):
                # We optimise and then we determine eligibility for funding, based on the measures selected
                optimiser = (
                    GainOptimiser(
-                        input_measures, max_cost=body.budget, max_gain=gain, allow_slack=False
-                    ) if body.budget else CostOptimiser(input_measures, min_gain=gain)
+                        input_measures, max_cost=body.budget, max_gain=float(gain) if gain is not None else 0,
+                        allow_slack=False
+                    ) if body.budget else CostOptimiser(input_measures, min_gain=float(gain) if gain is not None else 0)
                )
                optimiser.setup()
                optimiser.solve()
@ -1149,7 +1220,7 @@ async def model_engine(body: PlanTriggerRequest):
                )
                battery_sap_score = BatterySAPScorer.score(starting_sap=post_sap, pv_size=pv_size)

-            # We add the defauly already installed measures to the solution
+            # We add the defaulty already installed measures to the solution
            selected = {r["id"] for r in solution + default_already_installed}

            if property_required_measures: