diff --git a/.idea/Model.iml b/.idea/Model.iml
index 09f2e496..c6561970 100644
--- a/.idea/Model.iml
+++ b/.idea/Model.iml
@@ -7,7 +7,7 @@
-
+
\ No newline at end of file
diff --git a/.idea/misc.xml b/.idea/misc.xml
index fb10c6b0..50cad4ca 100644
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
@@ -3,7 +3,7 @@
-
+
diff --git a/asset_list/app.py b/asset_list/app.py
index 2903e083..20cf04f1 100644
--- a/asset_list/app.py
+++ b/asset_list/app.py
@@ -59,9 +59,111 @@ def app():
Property UPRN
"""
+ # Cambridge - west addresses
+ data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Cambridge/"
+ data_filename = "22.10_Cambridge_west addresses.xlsx"
+ sheet_name = "Asset List"
+ postcode_column = 'Postcode'
+ address1_column = None
+ address1_method = "house_number_extraction"
+ fulladdress_column = "Full Address"
+ address_cols_to_concat = []
+ missing_postcodes_method = None
+ landlord_year_built = None
+ landlord_os_uprn = None
+ landlord_property_type = None
+ landlord_built_form = None
+ landlord_wall_construction = None
+ landlord_roof_construction = None
+ landlord_heating_system = None
+ landlord_existing_pv = None
+ landlord_property_id = "id"
+ landlord_sap = None
+ outcomes_filename = None
+ outcomes_sheetname = None
+ outcomes_postcode = None
+ outcomes_houseno = None
+ outcomes_id = None
+ outcomes_address = None
+ master_filepaths = []
+ master_id_colnames = []
+ master_to_asset_list_filepath = None
+ phase = False
+ ecosurv_landlords = None
+ asset_list_header = 0
+ landlord_block_reference = None
+
+ # Property Box
+ data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/NRLA/Property Box"
+ data_filename = "Property Box Finance Portfolio.xlsx"
+ sheet_name = "Sheet1"
+ postcode_column = 'Postcode'
+ address1_column = None
+ address1_method = "house_number_extraction"
+ fulladdress_column = "Address 1"
+ address_cols_to_concat = []
+ missing_postcodes_method = None
+ landlord_year_built = None
+ landlord_os_uprn = None
+ landlord_property_type = None
+ landlord_built_form = None
+ landlord_wall_construction = None
+ landlord_roof_construction = None
+ landlord_heating_system = None
+ landlord_existing_pv = None
+ landlord_property_id = "row_id"
+ landlord_sap = None
+ outcomes_filename = None
+ outcomes_sheetname = None
+ outcomes_postcode = None
+ outcomes_houseno = None
+ outcomes_id = None
+ outcomes_address = None
+ master_filepaths = []
+ master_id_colnames = []
+ master_to_asset_list_filepath = None
+ phase = False
+ ecosurv_landlords = None
+ asset_list_header = 0
+ landlord_block_reference = "block_id"
+
+ # CDS - able-to-pay
+ data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/CDS/Able to pay"
+ data_filename = "CDS_ASSET LIST_(2314).xlsx"
+ sheet_name = "Sheet1"
+ postcode_column = 'Property Address - Postcode'
+ address1_column = "Property Address - Line 1"
+ address1_method = None
+ fulladdress_column = "Property Address - Line 1"
+ address_cols_to_concat = []
+ missing_postcodes_method = None
+ landlord_year_built = None
+ landlord_os_uprn = None
+ landlord_property_type = None
+ landlord_built_form = None
+ landlord_wall_construction = None
+ landlord_roof_construction = None
+ landlord_heating_system = None
+ landlord_existing_pv = None
+ landlord_property_id = "row_id"
+ landlord_sap = None
+ outcomes_filename = None
+ outcomes_sheetname = None
+ outcomes_postcode = None
+ outcomes_houseno = None
+ outcomes_id = None
+ outcomes_address = None
+ master_filepaths = []
+ master_id_colnames = []
+ master_to_asset_list_filepath = None
+ phase = False
+ ecosurv_landlords = None
+ asset_list_header = 0
+ landlord_block_reference = None
+
# Hyde - solar
data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Hyde/Solar"
- data_filename = "Domna Property Analysis HYDE (Chichester Removed).xlsx"
+ data_filename = "Domna Property Analysis HYDE (Chichester Removed)V2-Completed.xlsx"
sheet_name = "Electric Property Inspections"
postcode_column = 'Postcode'
address1_column = None # Is only patchily populated so we create it
@@ -88,14 +190,14 @@ def app():
master_filepaths = []
master_id_colnames = []
master_to_asset_list_filepath = None
- phase = True
+ phase = False
ecosurv_landlords = None
asset_list_header = 0
landlord_block_reference = None
# Hyde cavity
data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Hyde/Cavity"
- data_filename = "Domna Property Analysis HYDE (Chichester Removed).xlsx"
+ data_filename = "Domna Property Analysis HYDE (Chichester Removed)V2-Completed.xlsx"
sheet_name = "Cavity Inspections"
postcode_column = 'Postcode'
address1_column = None # Is only patchily populated so we create it
@@ -122,7 +224,7 @@ def app():
master_filepaths = []
master_id_colnames = []
master_to_asset_list_filepath = None
- phase = True
+ phase = False
ecosurv_landlords = None
asset_list_header = 0
landlord_block_reference = None
diff --git a/backend/SearchEpc.py b/backend/SearchEpc.py
index 16dd8f04..1a14e87a 100644
--- a/backend/SearchEpc.py
+++ b/backend/SearchEpc.py
@@ -347,7 +347,8 @@ class SearchEpc:
# We update the data with the correct uprn
if self.uprn:
for x in api_response["response"]["rows"]:
- x["uprn"] = self.uprn
+ if pd.isnull(x["uprn"]):
+ x["uprn"] = self.uprn
data["rows"].extend(api_response["response"]["rows"])
@@ -357,6 +358,8 @@ class SearchEpc:
row for row in data["rows"]
if row["lmk-key"] not in seen and not seen.add(row["lmk-key"])
]
+ # Overwrite the data
+ self.data = data
if data["rows"]:
api_response["msg"] = self.SUCCESS
diff --git a/backend/engine/engine.py b/backend/engine/engine.py
index fa1f191c..f2674290 100644
--- a/backend/engine/engine.py
+++ b/backend/engine/engine.py
@@ -145,14 +145,17 @@ def extract_portfolio_aggregation_data(
cost = sum([r["total"] for r in default_recommendations])
sap_point_improvement = sum([r["sap_points"] for r in default_recommendations])
- lower_bound_valuation_uplift = (
- property_value_increase_ranges[p.id]["lower_bound_increased_value"] -
- property_value_increase_ranges[p.id]["current_value"]
- )
- upper_bound_valuation_uplift = (
- property_value_increase_ranges[p.id]["upper_bound_increased_value"] -
- property_value_increase_ranges[p.id]["current_value"]
- )
+ if not pd.isnull(property_value_increase_ranges[p.id]["current_value"]):
+ lower_bound_valuation_uplift = (
+ property_value_increase_ranges[p.id]["lower_bound_increased_value"] -
+ property_value_increase_ranges[p.id]["current_value"]
+ )
+ upper_bound_valuation_uplift = (
+ property_value_increase_ranges[p.id]["upper_bound_increased_value"] -
+ property_value_increase_ranges[p.id]["current_value"]
+ )
+ else:
+ lower_bound_valuation_uplift, upper_bound_valuation_uplift = 0, 0
agg_data.append({
"pre_retrofit_epc": p.data["current-energy-rating"],
@@ -523,6 +526,7 @@ async def model_engine(body: PlanTriggerRequest):
plan_input["built_form"] = plan_input["built_form"].map(built_form_map)
plan_input = plan_input.to_dict("records")
+
else:
raise ValueError("Other formats not yet supported")
@@ -549,6 +553,13 @@ async def model_engine(body: PlanTriggerRequest):
# If we have patches or overrides, we should read them in here
patches, already_installed, non_invasive_recommendations, valuation_data = get_request_property_data(body)
+ if body.file_type == "xlsx" and body.file_format == "domna_asset_list":
+ # We check if we have valuation data
+ if not valuation_data and body.valuation_file_path in [None, ""]:
+ # We check plan_input
+ if "domna_valuation" in plan_input[0]:
+ valuation_data = [{"uprn": x["uprn"], "valuation": x["domna_valuation"]} for x in plan_input]
+
cleaning_data = read_dataframe_from_s3_parquet(
bucket_name=get_settings().DATA_BUCKET, file_key="sap_change_model/cleaning_dataset.parquet",
)
@@ -563,12 +574,22 @@ async def model_engine(body: PlanTriggerRequest):
if uprn:
uprn = int(float(uprn))
+ address1 = config.get("address", None)
+ # Handle domna address list format
+ if pd.isnull(address1) and body.file_format == "domna_asset_list":
+ address1 = config.get("domna_full_address", None)
+
+ address1 = str(int(address1)) if isinstance(address1, float) else str(address1)
+
+ full_address = config["domna_full_address"] if body.file_format == "domna_asset_list" else None
+
epc_searcher = SearchEpc(
- address1=str(config["address"]),
+ address1=address1,
postcode=config["postcode"],
uprn=uprn,
auth_token=get_settings().EPC_AUTH_TOKEN,
os_api_key="",
+ full_address=full_address
)
epc_searcher.ordnance_survey_client.built_form = config.get("built_form", None)
epc_searcher.ordnance_survey_client.property_type = config.get("property_type", None)
@@ -1176,9 +1197,10 @@ async def model_engine(body: PlanTriggerRequest):
upload_funding(session, p, new_plan_id, recommendations_to_upload)
- property_valuation_increases.append(
- valuations["average_increased_value"] - valuations["current_value"]
- )
+ if valuations["current_value"] > 0:
+ property_valuation_increases.append(
+ valuations["average_increased_value"] - valuations["current_value"]
+ )
# Commit the session after each batch
session.commit()
diff --git a/backend/ml_models/Valuation.py b/backend/ml_models/Valuation.py
index 8c57900f..17db0dae 100644
--- a/backend/ml_models/Valuation.py
+++ b/backend/ml_models/Valuation.py
@@ -219,12 +219,19 @@ class PropertyValuation:
current_epc = property_instance.data["current-energy-rating"]
if not current_value:
+ # In this case, we return a % improvement rather than an absolute
+ relative_improvement = cls.estimate_valuation_improvement(
+ current_value=1,
+ current_epc=current_epc,
+ target_epc=target_epc,
+ total_cost=1
+ )
return {
"current_value": 0,
- "lower_bound_increased_value": 0,
- "upper_bound_increased_value": 0,
- "average_increased_value": 0,
- "average_increase": 0
+ "lower_bound_increased_value": relative_improvement["lower_bound_increased_value"] - 1,
+ "upper_bound_increased_value": relative_improvement["upper_bound_increased_value"] - 1,
+ "average_increased_value": relative_improvement["average_increased_value"] - 1,
+ "average_increase": relative_improvement["average_increase"]
}
return cls.estimate_valuation_improvement(current_value, current_epc, target_epc, total_cost)
diff --git a/etl/webscrape/Zoopla.py b/etl/webscrape/Zoopla.py
index bb86c759..7b3fd5b6 100644
--- a/etl/webscrape/Zoopla.py
+++ b/etl/webscrape/Zoopla.py
@@ -1,38 +1,111 @@
-# Initial Code
-
-from seleniumbase import SB
+from bs4 import BeautifulSoup
+import pandas as pd
import time
+from stealth_requests import StealthSession
+import random
+from multiprocessing import Pool
+from tqdm import tqdm
-uprns = [
- 100071297618,
- 100080893397,
- 100060778033,
- 200004793081,
- 100071265143,
- 100071297618,
- 100080893397,
- 100060778033,
- 200004793081,
- 100071265143,
-]
+ENGINES = ["safari", "chrome"]
-estimate_list = []
-for uprn in uprns:
+def scrape_all_estimates(session, url):
+    """Fetch ``url`` and return the sale-estimate blocks found on the page.
+
+    Args:
+        session: Object exposing ``get(url, impersonate=...)``; the caller in
+            this file passes a ``StealthSession``.
+        url: Address of the page to fetch.
+
+    Returns:
+        A tuple ``(estimates, is_blocked)`` where ``estimates`` is the list of
+        ``div`` elements whose ``data-testid`` is ``"sale-estimate"`` and
+        ``is_blocked`` is ``True`` when that list is empty.
+    """
+    # Rotate impersonation per request. random.choice follows the length of
+    # ENGINES, so adding or removing an engine needs no change here.
+    resp = session.get(url, impersonate=random.choice(ENGINES))
+    page_source = BeautifulSoup(resp.text, "html.parser")
+    estimates = page_source.find_all("div", {"data-testid": "sale-estimate"})
+    # An absent estimate block is the only "blocked" signal used here.
+    # NOTE(review): a page that legitimately has no estimate looks the same.
+    is_blocked = not estimates
+    return estimates, is_blocked
- # Probably can change the timings here
- time.sleep(5)
- with SB(uc=True) as sb:
- sb.uc_open_with_reconnect(
- f"https://www.zoopla.co.uk/property/uprn/{uprn}/",
- 3,
+
+def parallel_task(url, max_retries=10):
+    """Scrape the low / middle / high sale estimates for one property page.
+
+    Opens its own ``StealthSession`` and retries while
+    ``scrape_all_estimates`` reports the request as blocked.
+
+    Args:
+        url: Address of the property page to scrape.
+        max_retries: Maximum number of retries after the first attempt.
+            ``None`` retries without limit (the previous behaviour), which
+            never returns for a page that has no estimate block at all.
+
+    Returns:
+        A dict with the keys ``"URL"``, ``"Low Estimate"``,
+        ``"Middle Estimate"`` and ``"High Estimate"``. An estimate is ``None``
+        when the page stayed blocked or the expected tag was not found.
+    """
+    # No impersonate argument here: it is chosen per request by
+    # scrape_all_estimates.
+    with StealthSession() as session:
+        estimates, is_blocked = scrape_all_estimates(session, url)
+
+        retries = 0
+        while is_blocked and (max_retries is None or retries < max_retries):
+            print(f"Blocked by Zoopla for URL: {url}")
+            # Random delay before retrying
+            time.sleep(random.uniform(0, 1))
+            estimates, is_blocked = scrape_all_estimates(session, url)
+            retries += 1
+
+    result = {
+        "URL": url,
+        "Low Estimate": None,
+        "Middle Estimate": None,
+        "High Estimate": None,
+    }
+    if is_blocked:
+        # Give up on this URL so one page cannot stall the whole run.
+        return result
+
+    wanted = (
+        ("Low Estimate", "span", "low-estimate-blurred"),
+        ("Middle Estimate", "p", "estimate-blurred"),
+        ("High Estimate", "span", "high-estimate-blurred"),
+    )
+    for key, tag_name, test_id in wanted:
+        node = estimates[0].find(tag_name, {"data-testid": test_id})
+        # find() returns None when the tag is missing; keep None rather than
+        # raising AttributeError on ``.text``.
+        if node is not None:
+            result[key] = node.text
+    return result
+
+
+# Suffix -> multiplier for abbreviated prices such as "£450k" or "£1.5m".
+_PRICE_MULTIPLIERS = {"k": 1_000, "m": 1_000_000}
+
+
+def parse_price(p):
+    """Convert a price string such as ``"£450k"`` into a float.
+
+    Strips the pound sign, thousands separators and surrounding whitespace,
+    and expands a trailing ``k`` / ``m`` (case-insensitive).
+
+    Args:
+        p: Price text, or ``None`` when no estimate was scraped.
+
+    Returns:
+        The price as a float, or NaN when ``p`` is ``None`` or blank.
+
+    Raises:
+        ValueError: If the remaining text is not a number.
+    """
+    if p is None:
+        return float("nan")
+    text = str(p).replace("£", "").replace(",", "").strip().lower()
+    if not text:
+        return float("nan")
+    multiplier = _PRICE_MULTIPLIERS.get(text[-1])
+    if multiplier is not None:
+        return float(text[:-1]) * multiplier
+    return float(text)
+
+
+# def parallel_task(url):
+# with StealthSession(impersonate=ENGINES[random.randint(0, 1)]) as session:
+# estimates, is_blocked = scrape_all_estimates(session, url)
+#
+# while is_blocked:
+# # Will need to wait and retry if blocked by Zoopla
+# print(f"Blocked by Zoopla for URL: {url}")
+# sleep_factor = random.uniform(0, 1) # Random delay to avoid detection
+# time.sleep(sleep_factor * 1)
+# estimates, is_blocked = scrape_all_estimates(session, url)
+#
+# low_estimate = (
+# estimates[0].find("span", {"data-testid": "low-estimate-blurred"}).text
+# ) # Find all span elements with data-testid="low-estimate"
+# middle_estimate = (
+# estimates[0].find("p", {"data-testid": "estimate-blurred"}).text
+# ) # Find all span elements with data-testid="middle-estimate"
+# high_estimate = (
+# estimates[0].find("span", {"data-testid": "high-estimate-blurred"}).text
+# ) # Find all span elements with data-testid="high-estimate-blurred"
+#
+# return {
+# "URL": url,
+# "Low Estimate": low_estimate,
+# "Middle Estimate": middle_estimate,
+# "High Estimate": high_estimate,
+# }
+
+
+# Script entry point: read the asset list, scrape one Zoopla page per UPRN in
+# parallel, then write the estimates to CSV and the merged asset list to Excel.
+if __name__ == "__main__":
+ # Get a SAL (the "Standardised Asset List" sheet of the workbook below)
+ asset_list = pd.read_excel(
+ "/Users/khalimconn-kowlessar/Documents/hestia/Customers/NRLA/Property Box/Property Box Finance Portfolio - "
+ "Standardised.xlsx",
+ sheet_name="Standardised Asset List"
+ )
+ # UPRNs are cast to int and then str so they can be placed in the URL and
+ # later compared with the string UPRN extracted from it.
+ # NOTE(review): astype(int) raises if any epc_os_uprn is missing - confirm the column is fully populated.
+ asset_list["epc_os_uprn"] = asset_list["epc_os_uprn"].astype(int).astype(str)
+ # NOTE(review): duplicate UPRNs are not removed, so each would be scraped more than once and
+ # would multiply rows in the merge at the end of this block - confirm UPRNs are unique.
+ uprns = asset_list["epc_os_uprn"].tolist()
+ urls = [f"https://www.zoopla.co.uk/property/uprn/{uprn}/" for uprn in uprns]
+
+ # Five worker processes; imap yields results in the same order as urls, and tqdm shows progress.
+ with Pool(processes=5) as pool:
+ estimates_list = list(
+ tqdm(
+ pool.imap(parallel_task, urls),
+ total=len(urls),
+ )
+ )
- soup = sb.get_beautiful_soup()
+ df = pd.DataFrame(estimates_list)
+ # Extract UPRN from URL
+ df["uprn"] = df["URL"].str.extract(r"uprn/(\d+)/")
+ # Only the middle estimate is converted to a numeric valuation.
+ df["valuation"] = df["Middle Estimate"].apply(parse_price)
+ # Raw scrape results are saved before the merge.
+ df.to_csv("zoopla_estimates.csv", index=False)
- estimates = soup.find_all("div", {"data-testid": "sale-estimate"})
- # Can change the way we extract the text here
- estimate_text = (
- estimates[-1].find_all("p")[-1].find_all("span")[-1]["aria-label"]
- )
- estimate_list.append(estimate_text)
+ # Same int -> str normalisation as epc_os_uprn, so the merge keys compare equal.
+ df["uprn"] = df["uprn"].astype(int).astype(str)
+
+ # Left merge keeps every asset-list row; rows without a scraped valuation get NaN.
+ asset_list.merge(df[["uprn", "valuation"]], left_on="epc_os_uprn", right_on="uprn", how="left").to_excel(
+ "Property Box Finance Portfolio - Standardised - with valuations.xlsx", index=False
+ )
diff --git a/etl/webscrape/requirements.txt b/etl/webscrape/requirements.txt
new file mode 100644
index 00000000..4027a224
--- /dev/null
+++ b/etl/webscrape/requirements.txt
@@ -0,0 +1,5 @@
+beautifulsoup4>=4.12.0
+pandas>=2.0.0
+stealth-requests>=1.0.7
+tqdm>=4.65.0
+openpyxl
\ No newline at end of file