diff --git a/.idea/Model.iml b/.idea/Model.iml
index c6561970..09f2e496 100644
--- a/.idea/Model.iml
+++ b/.idea/Model.iml
@@ -7,7 +7,7 @@
-
+
\ No newline at end of file
diff --git a/.idea/misc.xml b/.idea/misc.xml
index 50cad4ca..fb10c6b0 100644
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
@@ -3,7 +3,7 @@
-
+
diff --git a/asset_list/app.py b/asset_list/app.py
index bb5cb427..b832a3e8 100644
--- a/asset_list/app.py
+++ b/asset_list/app.py
@@ -298,13 +298,13 @@ def app():
landlord_block_reference = None
# Project from Nick
- data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/sfr/Sep2025 Project"
- data_filename = "AL Test.xlsx"
- sheet_name = "Sheet1"
- postcode_column = 'postcode'
+ data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/sfr/October 2025 AL portfolio"
+ data_filename = "22.10 AL Portfolio.xlsx"
+ sheet_name = "22.10 AL Portfolio"
+ postcode_column = 'Postcode'
address1_column = None
address1_method = 'house_number_extraction'
- fulladdress_column = "address"
+ fulladdress_column = "Address"
address_cols_to_concat = []
missing_postcodes_method = None
landlord_year_built = None
@@ -315,7 +315,7 @@ def app():
landlord_roof_construction = None
landlord_heating_system = None
landlord_existing_pv = None
- landlord_property_id = "row_id"
+ landlord_property_id = "Row ID"
landlord_sap = None
outcomes_filename = None
outcomes_sheetname = None
diff --git a/etl/webscrape/Zoopla.py b/etl/webscrape/Zoopla.py
index 7b3fd5b6..2c446dc8 100644
--- a/etl/webscrape/Zoopla.py
+++ b/etl/webscrape/Zoopla.py
@@ -3,109 +3,126 @@ import pandas as pd
import time
from stealth_requests import StealthSession
import random
+import os
from multiprocessing import Pool
from tqdm import tqdm
ENGINES = ["safari", "chrome"]
+CACHE_DIR = "zoopla_cache"
+os.makedirs(CACHE_DIR, exist_ok=True)
+
+
+def random_delay():
+ """Pause randomly between requests (0.5–2 s)."""
+ time.sleep(random.uniform(0.5, 2))
def scrape_all_estimates(session, url):
- # Rotate impersonation per request
- resp = session.get(url, impersonate=ENGINES[random.randint(0, 1)])
+ """Scrape valuation estimates for one Zoopla property URL."""
+ resp = session.get(url, impersonate=random.choice(ENGINES))
page_source = BeautifulSoup(resp.text, "html.parser")
estimates = page_source.find_all("div", {"data-testid": "sale-estimate"})
is_blocked = len(estimates) == 0
- return estimates, is_blocked
+ return estimates, is_blocked, resp.text
+
+
+def extract_estimates(estimates):
+ """Extract low, mid, and high estimates from parsed HTML."""
+ est = estimates[0]
+ low = est.find("span", {"data-testid": "low-estimate-blurred"}).text
+ mid = est.find("p", {"data-testid": "estimate-blurred"}).text
+ high = est.find("span", {"data-testid": "high-estimate-blurred"}).text
+ return low, mid, high
+
+
+def cache_path_for_url(url):
+ """Return a deterministic local cache path for a URL."""
+ uprn = url.split("/")[-2]
+ return os.path.join(CACHE_DIR, f"{uprn}.html")
def parallel_task(url):
- # No impersonate argument here
+ """Main worker function executed in each process."""
+ cache_path = cache_path_for_url(url)
+
+ # Reuse the cached page only if it still contains an estimate; otherwise fall through to a live scrape
+ if os.path.exists(cache_path):
+ html = open(cache_path, "r").read()
+ page_source = BeautifulSoup(html, "html.parser")
+ estimates = page_source.find_all("div", {"data-testid": "sale-estimate"})
+ if estimates:
+ low, mid, high = extract_estimates(estimates)
+ return {"URL": url, "Low Estimate": low, "Middle Estimate": mid, "High Estimate": high}
+
+ # Otherwise scrape live
with StealthSession() as session:
- estimates, is_blocked = scrape_all_estimates(session, url)
+ attempts = 0
+ while attempts < 5:
+ estimates, is_blocked, html = scrape_all_estimates(session, url)
+ if not is_blocked and estimates:
+ open(cache_path, "w").write(html)
+ low, mid, high = extract_estimates(estimates)
+ return {"URL": url, "Low Estimate": low, "Middle Estimate": mid, "High Estimate": high}
+ attempts += 1
+ print(f"[Attempt {attempts}] Blocked or empty for {url}")
+ random_delay()
- while is_blocked:
- print(f"Blocked by Zoopla for URL: {url}")
- time.sleep(random.uniform(0, 1))
- estimates, is_blocked = scrape_all_estimates(session, url)
-
- low_estimate = estimates[0].find("span", {"data-testid": "low-estimate-blurred"}).text
- middle_estimate = estimates[0].find("p", {"data-testid": "estimate-blurred"}).text
- high_estimate = estimates[0].find("span", {"data-testid": "high-estimate-blurred"}).text
-
- return {
- "URL": url,
- "Low Estimate": low_estimate,
- "Middle Estimate": middle_estimate,
- "High Estimate": high_estimate,
- }
+ # Still blocked or empty after 5 attempts: return None placeholders for this URL
+ return {"URL": url, "Low Estimate": None, "Middle Estimate": None, "High Estimate": None}
def parse_price(p):
+ if p is None:
+ return None
+
p = p.replace("£", "").strip().lower()
+ if not p:
+ return None
if p.endswith("k"):
- return float(p[:-1]) * 1000
+ return float(p[:-1]) * 1_000
elif p.endswith("m"):
return float(p[:-1]) * 1_000_000
else:
- return float(p)
-
-
-# def parallel_task(url):
-# with StealthSession(impersonate=ENGINES[random.randint(0, 1)]) as session:
-# estimates, is_blocked = scrape_all_estimates(session, url)
-#
-# while is_blocked:
-# # Will need to wait and retry if blocked by Zoopla
-# print(f"Blocked by Zoopla for URL: {url}")
-# sleep_factor = random.uniform(0, 1) # Random delay to avoid detection
-# time.sleep(sleep_factor * 1)
-# estimates, is_blocked = scrape_all_estimates(session, url)
-#
-# low_estimate = (
-# estimates[0].find("span", {"data-testid": "low-estimate-blurred"}).text
-# ) # Find all span elements with data-testid="low-estimate"
-# middle_estimate = (
-# estimates[0].find("p", {"data-testid": "estimate-blurred"}).text
-# ) # Find all span elements with data-testid="middle-estimate"
-# high_estimate = (
-# estimates[0].find("span", {"data-testid": "high-estimate-blurred"}).text
-# ) # Find all span elements with data-testid="high-estimate-blurred"
-#
-# return {
-# "URL": url,
-# "Low Estimate": low_estimate,
-# "Middle Estimate": middle_estimate,
-# "High Estimate": high_estimate,
-# }
+ try:
+ return float(p.replace(",", ""))
+ except ValueError:
+ return None
if __name__ == "__main__":
- # Get a SAL
+ # Load portfolio
asset_list = pd.read_excel(
- "/Users/khalimconn-kowlessar/Documents/hestia/Customers/NRLA/Property Box/Property Box Finance Portfolio - "
- "Standardised.xlsx",
+ "/Users/khalimconn-kowlessar/Documents/hestia/Customers/sfr/October 2025 AL portfolio/22.10 AL Portfolio - "
+ "Standardised - partial UPRN fill.xlsx",
sheet_name="Standardised Asset List"
)
+ asset_list = asset_list[~pd.isnull(asset_list["epc_os_uprn"])]
asset_list["epc_os_uprn"] = asset_list["epc_os_uprn"].astype(int).astype(str)
uprns = asset_list["epc_os_uprn"].tolist()
urls = [f"https://www.zoopla.co.uk/property/uprn/{uprn}/" for uprn in uprns]
- with Pool(processes=5) as pool:
+ # Limit concurrency to avoid blocks
+ with Pool(processes=2) as pool: # fewer processes = fewer fingerprints
estimates_list = list(
- tqdm(
- pool.imap(parallel_task, urls),
- total=len(urls),
- )
+ tqdm(pool.imap(parallel_task, urls), total=len(urls))
)
df = pd.DataFrame(estimates_list)
- # Extract UPRN from URL
df["uprn"] = df["URL"].str.extract(r"uprn/(\d+)/")
df["valuation"] = df["Middle Estimate"].apply(parse_price)
+
df.to_csv("zoopla_estimates.csv", index=False)
- df["uprn"] = df["uprn"].astype(int).astype(str)
-
- asset_list.merge(df[["uprn", "valuation"]], left_on="epc_os_uprn", right_on="uprn", how="left").to_excel(
- "Property Box Finance Portfolio - Standardised - with valuations.xlsx", index=False
+ # Merge with asset list
+ merged = asset_list.merge(
+ df[["uprn", "valuation"]],
+ left_on="epc_os_uprn",
+ right_on="uprn",
+ how="left"
)
+ merged.to_excel(
+ "20251029 AL Portfolio - Standardised - with valuations.xlsx",
+ index=False
+ )
+
+ print("Done. Results saved.")
diff --git a/recommendations/optimiser/funding_optimiser.py b/recommendations/optimiser/funding_optimiser.py
index 4da08587..5acdd5fd 100644
--- a/recommendations/optimiser/funding_optimiser.py
+++ b/recommendations/optimiser/funding_optimiser.py
@@ -427,8 +427,9 @@ def optimise_with_funding_paths(p, input_measures, housing_type, funding: Fundin
solutions["meets_upgrade_target"] = solutions["total_gain"] >= target_gain - 0.1
# If we have packages that are fundable, but do not meet the upgrade target, we can run a final optimisation pass
- if not solutions[solutions["is_eligible"] & ~solutions["meets_upgrade_target"]].empty:
- logger.info("We have some packages that are fundable but do not meet the target gain")
+ # Turned off logging - too noisy
+ # if not solutions[solutions["is_eligible"] & ~solutions["meets_upgrade_target"]].empty:
+ # logger.info("We have some packages that are fundable but do not meet the target gain")
# We now can calculate the project ABS, which subtracts from the cost, but this is only relevant for ECO4
solutions["starting_sap"] = p.data["current-energy-efficiency"]