Fixing boiler upgrade recommendation

2026-07-27 23:35:01 +00:00 · 2025-11-10 20:46:57 +00:00 · 2025-11-10 20:46:57 +00:00 · 4151b58dea
commit 4151b58dea
parent 0f2a064f40
3 changed files with 90 additions and 21 deletions
--- a/backend/app/plan/router.py
+++ b/backend/app/plan/router.py
@@ -43,18 +43,6 @@ async def trigger_plan_entrypoint(body: PlanTriggerRequest):
        logger.error("Failed to parse request body: %s", e)
        return {"message": "Invalid request"}, 400

-    # TODO: Warm up the lambdas here
-    # from backend.ml_models.api import ModelApi
-    # model_api = ModelApi(
-    #     portfolio_id=body.portfolio_id,
-    #     timestamp="2020-01-01T00:00:00",
-    #     prediction_buckets=[],
-    #     max_retries=1
-    # )
-    # await model_api.async_warm_up_lambdas(
-    #     model_prefies=model_api.KWH_MODEL_PREFIXES + model_api.MODEL_PREFIXES
-    # )
-
    # If file_format is domna_asset_list and type is xlsx, read and chunk it
    if data.get("file_format") == "domna_asset_list" and data.get("file_type") == "xlsx":
        try:
@@ -94,6 +82,7 @@ async def trigger_plan_entrypoint(body: PlanTriggerRequest):
                data["scenario_id"] = scenario_id

            for i in range(total_chunks):
+                # Create an entry in the request logs table
                index_start = i * chunk_size
                index_end = min((i + 1) * chunk_size, total_rows)

--- a/etl/webscrape/Zoopla.py
+++ b/etl/webscrape/Zoopla.py
@@ -6,6 +6,8 @@ import random
 import os
 from multiprocessing import Pool
 from tqdm import tqdm
+import re
+import json

 ENGINES = ["safari", "chrome"]
 CACHE_DIR = "zoopla_cache"
@@ -17,13 +19,69 @@ def random_delay():
    time.sleep(random.uniform(0.5, 2))


+def extract_feature(soup, icon_id):
+    """Return the text of the container that holds the given icon, if any.
+
+    Finds the first ``use`` tag whose ``href`` is ``#<icon_id>``, then its
+    nearest enclosing ``div`` with class ``_1pbf8i53``, and returns that
+    div's ``get_text(strip=True)``. Returns ``None`` when either the tag or
+    the enclosing div is not found.
+    """
+    icon = soup.find("use", href=f"#{icon_id}")
+    if not icon:
+        return None
+
+    container = icon.find_parent("div", class_="_1pbf8i53")
+    if not container:
+        return None
+
+    return container.get_text(strip=True)
+
+
+def extract_embedded_json(text):
+    """
+    Extract embedded property JSON containing attributes, energy, estimates, and sales history.
+
+    Two strategies are tried in order:
+
+    1. Match the span that runs from the ``"attributes"`` object through the
+       first ``]`` after ``"historicSales"``, wrap it in braces and parse it
+       as one JSON object.
+    2. If that span is missing or does not parse, look each known key up on
+       its own and decode the object/array that follows it.
+
+    Args:
+        text: Raw page source to search.
+
+    Returns:
+        dict: The parsed span from strategy 1, otherwise a dict holding
+        whichever known keys could be decoded (empty when none could).
+    """
+    # Span from the "attributes" object up to the first "]" after "historicSales".
+    match = re.search(
+        r'"attributes"\s*:\s*\{.*?\}\s*,.*?"historicSales".*?\]',
+        text,
+        re.DOTALL
+    )
+    if match:
+        snippet = "{" + match.group(0) + "}"
+        # Turn literal \u0022 escape sequences back into double quotes.
+        snippet = snippet.replace("\\u0022", '"')
+        # Drop trailing commas before a closing brace/bracket; JSON forbids them.
+        snippet = re.sub(r",(\s*[}\]])", r"\1", snippet)
+        try:
+            return json.loads(snippet)
+        except json.JSONDecodeError:
+            # Fall through to the per-key strategy below.
+            pass
+
+    # Fallback for independent keys. raw_decode parses exactly one complete
+    # JSON value, so nested objects/arrays are not cut off at the first
+    # closing brace/bracket the way a non-greedy regex capture would be.
+    decoder = json.JSONDecoder()
+    result = {}
+    for key in (
+        "attributes", "energy", "rentEstimate",
+        "saleEstimate", "saleHistory", "historicSales"
+    ):
+        # Only consider occurrences whose value starts with "{" or "[".
+        for key_match in re.finditer(rf'"{key}"\s*:\s*(?=[{{\[])', text):
+            try:
+                value, _ = decoder.raw_decode(text, key_match.end())
+            except json.JSONDecodeError:
+                # Not valid JSON at this occurrence; try the next one.
+                continue
+            result[key] = value
+            break
+    return result
+
+
 def scrape_all_estimates(session, url):
    """Scrape valuation estimates for one Zoopla property URL."""
    resp = session.get(url, impersonate=random.choice(ENGINES))
+    html = resp.text
    page_source = BeautifulSoup(resp.text, "html.parser")
    estimates = page_source.find_all("div", {"data-testid": "sale-estimate"})
+
+    data = extract_embedded_json(html)
+
    is_blocked = len(estimates) == 0
-    return estimates, is_blocked, resp.text
+
+    return {
+        "estimates": estimates,
+        "is_blocked": is_blocked,
+        "response_html": html,
+        "attributes": data.get("attributes"),
+        "rent": data.get("rentEstimate"),
+        "historicSales": data.get("historicSales"),
+    }


 def extract_estimates(estimates):
@@ -50,19 +108,36 @@ def parallel_task(url):
        html = open(cache_path, "r").read()
        page_source = BeautifulSoup(html, "html.parser")
        estimates = page_source.find_all("div", {"data-testid": "sale-estimate"})
+        data = extract_embedded_json(html)
+        history_sales = data.get("historicSales", [{}])
+        if len(history_sales) == 0:
+            history_sales = [{}]
+
        if estimates:
            low, mid, high = extract_estimates(estimates)
-            return {"URL": url, "Low Estimate": low, "Middle Estimate": mid, "High Estimate": high}
+            return {
+                "URL": url, "Low Estimate": low, "Middle Estimate": mid, "High Estimate": high,
+                **data.get("attributes", {}), **data.get("rentEstimate", {}),
+                **history_sales[0]
+            }

    # Otherwise scrape live
    with StealthSession() as session:
        attempts = 0
        while attempts < 5:
-            estimates, is_blocked, html = scrape_all_estimates(session, url)
-            if not is_blocked and estimates:
-                open(cache_path, "w").write(html)
-                low, mid, high = extract_estimates(estimates)
-                return {"URL": url, "Low Estimate": low, "Middle Estimate": mid, "High Estimate": high}
+            output = scrape_all_estimates(session, url)
+            if not output["is_blocked"] and output["estimates"]:
+                open(cache_path, "w").write(output["html"])
+                low, mid, high = extract_estimates(output["estimates"])
+                history_sales = output.get("historicSales", [{}])
+                if len(history_sales) == 0:
+                    history_sales = [{}]
+                return {
+                    "URL": url, "Low Estimate": low, "Middle Estimate": mid, "High Estimate": high,
+                    **output.get("attributes", {}),
+                    **output.get("rent", {}),
+                    **history_sales[0]
+                }
            attempts += 1
            print(f"[Attempt {attempts}] Blocked or empty for {url}")
            random_delay()
@@ -108,6 +183,9 @@ if __name__ == "__main__":
        )

    df = pd.DataFrame(estimates_list)
+
+    print(df.head())
+
    df["uprn"] = df["URL"].str.extract(r"uprn/(\d+)/")
    df["valuation"] = df["Middle Estimate"].apply(parse_price)

--- a/recommendations/HeatingRecommender.py
+++ b/recommendations/HeatingRecommender.py
@@ -1257,9 +1257,11 @@ class HeatingRecommender:
            # If there is not a system change, we add the boiler recommendation at point.
            self.heating_recommendations.extend([boiler_recommendation])

-        if system_change:
+        if system_change and len(boiler_recommendation):
            # We combine the heating and controls recommendations, in the case of a system change
-            # If this is true, we set SAP points to None and survey to False for the boiler recommendation
+            # If this is true, we set SAP points to None and survey to False for the boiler recommendation.
+            # We check if we actually have a boiler recommendation as we may not if the heating and hot water
+            # are already efficient enough

            combined_recommendations = []
            for controls_recommendation in controls_recommender.recommendation: