Fixing boiler upgrade recommendation

This commit is contained in:
Khalim Conn-Kowlessar 2025-11-10 20:46:57 +00:00
parent 0f2a064f40
commit 4151b58dea
3 changed files with 90 additions and 21 deletions

View file

@ -43,18 +43,6 @@ async def trigger_plan_entrypoint(body: PlanTriggerRequest):
logger.error("Failed to parse request body: %s", e)
return {"message": "Invalid request"}, 400
# TODO: Warm up the lambdas here
# from backend.ml_models.api import ModelApi
# model_api = ModelApi(
# portfolio_id=body.portfolio_id,
# timestamp="2020-01-01T00:00:00",
# prediction_buckets=[],
# max_retries=1
# )
# await model_api.async_warm_up_lambdas(
# model_prefies=model_api.KWH_MODEL_PREFIXES + model_api.MODEL_PREFIXES
# )
# If file_format is domna_asset_list and type is xlsx, read and chunk it
if data.get("file_format") == "domna_asset_list" and data.get("file_type") == "xlsx":
try:
@ -94,6 +82,7 @@ async def trigger_plan_entrypoint(body: PlanTriggerRequest):
data["scenario_id"] = scenario_id
for i in range(total_chunks):
# Create an entry in the request logs table
index_start = i * chunk_size
index_end = min((i + 1) * chunk_size, total_rows)

View file

@ -6,6 +6,8 @@ import random
import os
from multiprocessing import Pool
from tqdm import tqdm
import re
import json
ENGINES = ["safari", "chrome"]
CACHE_DIR = "zoopla_cache"
@ -17,13 +19,69 @@ def random_delay():
time.sleep(random.uniform(0.5, 2))
def extract_feature(soup, icon_id):
    """Return the text of the feature container associated with an icon.

    Looks up a ``<use>`` element whose ``href`` is ``#<icon_id>``, then the
    enclosing ``div`` carrying the class ``_1pbf8i53``, and returns that
    div's whitespace-stripped text.

    Args:
        soup: Parsed document to search (presumably a BeautifulSoup object;
            only ``find`` is required of it here).
        icon_id: Icon identifier, without the leading ``#``.

    Returns:
        The container's text, or ``None`` when either the icon reference or
        its enclosing container cannot be found.
    """
    icon_ref = soup.find("use", href=f"#{icon_id}")
    if not icon_ref:
        return None

    container = icon_ref.find_parent("div", class_="_1pbf8i53")
    if not container:
        return None

    return container.get_text(strip=True)
def extract_embedded_json(text):
    """
    Extract embedded property JSON containing attributes, energy, estimates, and sales history.

    Two strategies are tried in order:

    1. Grab the span running from ``"attributes": {...},`` up to the first
       ``]`` after ``"historicSales"``, wrap it in braces, unescape
       ``\\u0022`` to ``"``, strip trailing commas, and parse it as a single
       JSON object.
    2. If that span is missing or does not parse, look up each known key
       independently and decode the object/array that follows it.

    Args:
        text: Raw page text to search.

    Returns:
        A dict of whatever could be decoded. In the fallback path only the
        keys ``attributes``, ``energy``, ``rentEstimate``, ``saleEstimate``,
        ``saleHistory`` and ``historicSales`` can appear; keys that are
        absent or undecodable are simply omitted. An empty dict means nothing
        was found.
    """
    # Strategy 1: one contiguous snippet from "attributes" to "historicSales".
    match = re.search(
        r'"attributes"\s*:\s*\{.*?\}\s*,.*?"historicSales".*?\]',
        text,
        re.DOTALL,
    )
    if match:
        snippet = "{" + match.group(0) + "}"
        snippet = re.sub(r"\\u0022", '"', snippet)
        # Drop trailing commas before a closing bracket so json accepts it.
        snippet = re.sub(r",(\s*[}\]])", r"\1", snippet)
        try:
            return json.loads(snippet)
        except json.JSONDecodeError:
            # The lazy regex often cuts nested data short; fall through.
            pass

    # Strategy 2: decode each key on its own. A lazy "{...}" regex would stop
    # at the first closing bracket and truncate nested values, so let the JSON
    # decoder find where the value really ends.
    decoder = json.JSONDecoder()
    result = {}
    for key in (
        "attributes", "energy", "rentEstimate",
        "saleEstimate", "saleHistory", "historicSales",
    ):
        # Only consider occurrences whose value is an object or an array.
        key_match = re.search(rf'"{key}"\s*:\s*(?=[{{\[])', text)
        if key_match is None:
            continue
        try:
            value, _ = decoder.raw_decode(text[key_match.end():])
        except ValueError:
            # json.JSONDecodeError is a ValueError; skip undecodable values.
            continue
        result[key] = value
    return result
def scrape_all_estimates(session, url):
    """Scrape valuation estimates for one Zoopla property URL.

    Fetches the page with a randomly chosen impersonation engine, collects
    the ``div`` elements tagged ``data-testid="sale-estimate"`` and pulls the
    embedded property JSON out of the raw HTML.

    Args:
        session: HTTP session whose ``get`` accepts an ``impersonate``
            keyword (presumably a StealthSession; confirm against caller).
        url: Property page URL to fetch.

    Returns:
        A dict with:
            ``estimates``: the matching sale-estimate elements.
            ``is_blocked``: True when no sale-estimate element was found.
            ``response_html`` / ``html``: the raw response text. Both keys
                are provided because the caller in this file reads
                ``output["html"]``.
            ``attributes``: embedded attributes dict (``{}`` if missing).
            ``rent``: embedded rent estimate dict (``{}`` if missing).
            ``historicSales``: embedded sales list (``[]`` if missing).
    """
    resp = session.get(url, impersonate=random.choice(ENGINES))
    html = resp.text
    page_source = BeautifulSoup(html, "html.parser")
    estimates = page_source.find_all("div", {"data-testid": "sale-estimate"})
    data = extract_embedded_json(html)
    is_blocked = len(estimates) == 0
    # Missing sections are normalised to empty containers: a key holding None
    # would bypass the caller's .get(key, default) and break ** unpacking.
    return {
        "estimates": estimates,
        "is_blocked": is_blocked,
        "response_html": html,
        "html": html,
        "attributes": data.get("attributes") or {},
        "rent": data.get("rentEstimate") or {},
        "historicSales": data.get("historicSales") or [],
    }
def extract_estimates(estimates):
@ -50,19 +108,36 @@ def parallel_task(url):
html = open(cache_path, "r").read()
page_source = BeautifulSoup(html, "html.parser")
estimates = page_source.find_all("div", {"data-testid": "sale-estimate"})
data = extract_embedded_json(html)
history_sales = data.get("historicSales", [{}])
if len(history_sales) == 0:
history_sales = [{}]
if estimates:
low, mid, high = extract_estimates(estimates)
return {"URL": url, "Low Estimate": low, "Middle Estimate": mid, "High Estimate": high}
return {
"URL": url, "Low Estimate": low, "Middle Estimate": mid, "High Estimate": high,
**data.get("attributes", {}), **data.get("rentEstimate", {}),
**history_sales[0]
}
# Otherwise scrape live
with StealthSession() as session:
attempts = 0
while attempts < 5:
estimates, is_blocked, html = scrape_all_estimates(session, url)
if not is_blocked and estimates:
open(cache_path, "w").write(html)
low, mid, high = extract_estimates(estimates)
return {"URL": url, "Low Estimate": low, "Middle Estimate": mid, "High Estimate": high}
output = scrape_all_estimates(session, url)
if not output["is_blocked"] and output["estimates"]:
open(cache_path, "w").write(output["html"])
low, mid, high = extract_estimates(output["estimates"])
history_sales = output.get("historicSales", [{}])
if len(history_sales) == 0:
history_sales = [{}]
return {
"URL": url, "Low Estimate": low, "Middle Estimate": mid, "High Estimate": high,
**output.get("attributes", {}),
**output.get("rent", {}),
**history_sales[0]
}
attempts += 1
print(f"[Attempt {attempts}] Blocked or empty for {url}")
random_delay()
@ -108,6 +183,9 @@ if __name__ == "__main__":
)
df = pd.DataFrame(estimates_list)
print(df.head())
df["uprn"] = df["URL"].str.extract(r"uprn/(\d+)/")
df["valuation"] = df["Middle Estimate"].apply(parse_price)

View file

@ -1257,9 +1257,11 @@ class HeatingRecommender:
# If there is not a system change, we add the boiler recommendation at this point.
self.heating_recommendations.extend([boiler_recommendation])
if system_change:
if system_change and len(boiler_recommendation):
# We combine the heating and controls recommendations, in the case of a system change
# If this is true, we set SAP points to None and survey to False for the boiler recommendation
# If this is true, we set SAP points to None and survey to False for the boiler recommendation.
# We check if we actually have a boiler recommendation as we may not if the heating and hot water
# are already efficient enough
combined_recommendations = []
for controls_recommendation in controls_recommender.recommendation: