diff --git a/backend/app/plan/router.py b/backend/app/plan/router.py
index 8c502021..af57e35a 100644
--- a/backend/app/plan/router.py
+++ b/backend/app/plan/router.py
@@ -43,18 +43,6 @@ async def trigger_plan_entrypoint(body: PlanTriggerRequest):
         logger.error("Failed to parse request body: %s", e)
         return {"message": "Invalid request"}, 400
 
-    # TODO: Warm up the lambdas here
-    # from backend.ml_models.api import ModelApi
-    # model_api = ModelApi(
-    #     portfolio_id=body.portfolio_id,
-    #     timestamp="2020-01-01T00:00:00",
-    #     prediction_buckets=[],
-    #     max_retries=1
-    # )
-    # await model_api.async_warm_up_lambdas(
-    #     model_prefies=model_api.KWH_MODEL_PREFIXES + model_api.MODEL_PREFIXES
-    # )
-
     # If file_format is domna_asset_list and type is xlsx, read and chunk it
     if data.get("file_format") == "domna_asset_list" and data.get("file_type") == "xlsx":
         try:
@@ -94,6 +82,7 @@ async def trigger_plan_entrypoint(body: PlanTriggerRequest):
         data["scenario_id"] = scenario_id
 
         for i in range(total_chunks):
+
             # Create an entry in the request logs table
             index_start = i * chunk_size
             index_end = min((i + 1) * chunk_size, total_rows)
diff --git a/etl/webscrape/Zoopla.py b/etl/webscrape/Zoopla.py
index 2c446dc8..4c0443f1 100644
--- a/etl/webscrape/Zoopla.py
+++ b/etl/webscrape/Zoopla.py
@@ -6,6 +6,8 @@ import random
 import os
 from multiprocessing import Pool
 from tqdm import tqdm
+import re
+import json
 
 ENGINES = ["safari", "chrome"]
 CACHE_DIR = "zoopla_cache"
@@ -17,13 +19,69 @@ def random_delay():
     time.sleep(random.uniform(0.5, 2))
 
 
+def extract_feature(soup, icon_id):
+    tag = soup.find("use", href=f"#{icon_id}")
+    if tag:
+        parent = tag.find_parent("div", class_="_1pbf8i53")
+        if parent:
+            text = parent.get_text(strip=True)
+            return text
+    return None
+
+
+def extract_embedded_json(text):
+    """
+    Extract embedded property JSON containing attributes, energy, estimates, and sales history.
+    """
+    # Try to grab everything after "attributes"
+    match = re.search(
+        r'"attributes"\s*:\s*\{.*?\}\s*,.*?"historicSales".*?\]',
+        text,
+        re.DOTALL
+    )
+    if match:
+        snippet = "{" + match.group(0) + "}"
+        snippet = re.sub(r"\\u0022", '"', snippet)
+        snippet = re.sub(r",(\s*[}\]])", r"\1", snippet)
+        try:
+            return json.loads(snippet)
+        except json.JSONDecodeError:
+            pass
+
+    # fallback for independent keys
+    result = {}
+    for key in [
+        "attributes", "energy", "rentEstimate",
+        "saleEstimate", "saleHistory", "historicSales"
+    ]:
+        key_match = re.search(rf'"{key}"\s*:\s*(\{{.*?\}}|\[.*?\])', text, re.DOTALL)
+        if key_match:
+            try:
+                result[key] = json.loads(key_match.group(1))
+            except Exception:
+                pass
+    return result
+
+
 def scrape_all_estimates(session, url):
     """Scrape valuation estimates for one Zoopla property URL."""
     resp = session.get(url, impersonate=random.choice(ENGINES))
+    html = resp.text
     page_source = BeautifulSoup(resp.text, "html.parser")
     estimates = page_source.find_all("div", {"data-testid": "sale-estimate"})
+
+    data = extract_embedded_json(html)
+
     is_blocked = len(estimates) == 0
-    return estimates, is_blocked, resp.text
+
+    return {
+        "estimates": estimates,
+        "is_blocked": is_blocked,
+        "response_html": html,
+        "attributes": data.get("attributes"),
+        "rent": data.get("rentEstimate"),
+        "historicSales": data.get("historicSales"),
+    }
 
 
 def extract_estimates(estimates):
@@ -50,19 +108,36 @@ def parallel_task(url):
         html = open(cache_path, "r").read()
         page_source = BeautifulSoup(html, "html.parser")
         estimates = page_source.find_all("div", {"data-testid": "sale-estimate"})
+        data = extract_embedded_json(html)
+        # Embedded values may be missing, null or empty; `or` handles all
+        # three where a `.get()` default only handles "missing".
+        history_sales = data.get("historicSales") or [{}]
+
         if estimates:
             low, mid, high = extract_estimates(estimates)
-            return {"URL": url, "Low Estimate": low, "Middle Estimate": mid, "High Estimate": high}
+            return {
+                "URL": url, "Low Estimate": low, "Middle Estimate": mid, "High Estimate": high,
+                **(data.get("attributes") or {}), **(data.get("rentEstimate") or {}),
+                **history_sales[0]
+            }
 
     # Otherwise scrape live
     with StealthSession() as session:
         attempts = 0
         while attempts < 5:
-            estimates, is_blocked, html = scrape_all_estimates(session, url)
-            if not is_blocked and estimates:
-                open(cache_path, "w").write(html)
-                low, mid, high = extract_estimates(estimates)
-                return {"URL": url, "Low Estimate": low, "Middle Estimate": mid, "High Estimate": high}
+            output = scrape_all_estimates(session, url)
+            if not output["is_blocked"] and output["estimates"]:
+                with open(cache_path, "w") as cache_file:
+                    cache_file.write(output["response_html"])
+                low, mid, high = extract_estimates(output["estimates"])
+                # These keys are always present but may be None, so use `or` for the fallback.
+                history_sales = output.get("historicSales") or [{}]
+                return {
+                    "URL": url, "Low Estimate": low, "Middle Estimate": mid, "High Estimate": high,
+                    **(output.get("attributes") or {}),
+                    **(output.get("rent") or {}),
+                    **history_sales[0]
+                }
             attempts += 1
             print(f"[Attempt {attempts}] Blocked or empty for {url}")
             random_delay()
@@ -108,6 +183,9 @@ if __name__ == "__main__":
     )
 
     df = pd.DataFrame(estimates_list)
+
+    print(df.head())
+
     df["uprn"] = df["URL"].str.extract(r"uprn/(\d+)/")
     df["valuation"] = df["Middle Estimate"].apply(parse_price)
 
diff --git a/recommendations/HeatingRecommender.py b/recommendations/HeatingRecommender.py
index b8a1b5a7..d84a47b5 100644
--- a/recommendations/HeatingRecommender.py
+++ b/recommendations/HeatingRecommender.py
@@ -1257,9 +1257,11 @@ class HeatingRecommender:
             # If there is not a system change, we add the boiler recommendation at point.
             self.heating_recommendations.extend([boiler_recommendation])
 
-        if system_change:
+        if system_change and len(boiler_recommendation):
             # We combine the heating and controls recommendations, in the case of a system change
-            # If this is true, we set SAP points to None and survey to False for the boiler recommendation
+            # If this is true, we set SAP points to None and survey to False for the boiler recommendation.
+            # We check if we actually have a boiler recommendation as we may not if the heating and hot water
+            # are already efficient enough
             combined_recommendations = []
             for controls_recommendation in controls_recommender.recommendation:
 