allow no valuation and work with relative

This commit is contained in:
Khalim Conn-Kowlessar 2025-10-27 15:27:32 +00:00
parent e5272e2e64
commit 93723697a1
8 changed files with 265 additions and 53 deletions

2
.idea/Model.iml generated
View file

@ -7,7 +7,7 @@
<sourceFolder url="file://$MODULE_DIR$/open_uprn" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/recommendations" isTestSource="false" />
</content>
<orderEntry type="jdk" jdkName="AssetList" jdkType="Python SDK" />
<orderEntry type="jdk" jdkName="Fastapi-backend" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>

2
.idea/misc.xml generated
View file

@ -3,7 +3,7 @@
<component name="Black">
<option name="sdkName" value="Python 3.10 (backend)" />
</component>
<component name="ProjectRootManager" version="2" project-jdk-name="AssetList" project-jdk-type="Python SDK" />
<component name="ProjectRootManager" version="2" project-jdk-name="Fastapi-backend" project-jdk-type="Python SDK" />
<component name="PyCharmProfessionalAdvertiser">
<option name="shown" value="true" />
</component>

View file

@ -59,9 +59,111 @@ def app():
Property UPRN
"""
#
data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Cambridge/"
data_filename = "22.10_Cambridge_west addresses.xlsx"
sheet_name = "Asset List"
postcode_column = 'Postcode'
address1_column = None
address1_method = "house_number_extraction"
fulladdress_column = "Full Address"
address_cols_to_concat = []
missing_postcodes_method = None
landlord_year_built = None
landlord_os_uprn = None
landlord_property_type = None
landlord_built_form = None
landlord_wall_construction = None
landlord_roof_construction = None
landlord_heating_system = None
landlord_existing_pv = None
landlord_property_id = "id"
landlord_sap = None
outcomes_filename = None
outcomes_sheetname = None
outcomes_postcode = None
outcomes_houseno = None
outcomes_id = None
outcomes_address = None
master_filepaths = []
master_id_colnames = []
master_to_asset_list_filepath = None
phase = False
ecosurv_landlords = None
asset_list_header = 0
landlord_block_reference = None
# Property Box
data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/NRLA/Property Box"
data_filename = "Property Box Finance Portfolio.xlsx"
sheet_name = "Sheet1"
postcode_column = 'Postcode'
address1_column = None
address1_method = "house_number_extraction"
fulladdress_column = "Address 1"
address_cols_to_concat = []
missing_postcodes_method = None
landlord_year_built = None
landlord_os_uprn = None
landlord_property_type = None
landlord_built_form = None
landlord_wall_construction = None
landlord_roof_construction = None
landlord_heating_system = None
landlord_existing_pv = None
landlord_property_id = "row_id"
landlord_sap = None
outcomes_filename = None
outcomes_sheetname = None
outcomes_postcode = None
outcomes_houseno = None
outcomes_id = None
outcomes_address = None
master_filepaths = []
master_id_colnames = []
master_to_asset_list_filepath = None
phase = False
ecosurv_landlords = None
asset_list_header = 0
landlord_block_reference = "block_id"
# CDS - able-to-pay
data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/CDS/Able to pay"
data_filename = "CDS_ASSET LIST_(2314).xlsx"
sheet_name = "Sheet1"
postcode_column = 'Property Address - Postcode'
address1_column = "Property Address - Line 1"
address1_method = None
fulladdress_column = "Property Address - Line 1"
address_cols_to_concat = []
missing_postcodes_method = None
landlord_year_built = None
landlord_os_uprn = None
landlord_property_type = None
landlord_built_form = None
landlord_wall_construction = None
landlord_roof_construction = None
landlord_heating_system = None
landlord_existing_pv = None
landlord_property_id = "row_id"
landlord_sap = None
outcomes_filename = None
outcomes_sheetname = None
outcomes_postcode = None
outcomes_houseno = None
outcomes_id = None
outcomes_address = None
master_filepaths = []
master_id_colnames = []
master_to_asset_list_filepath = None
phase = False
ecosurv_landlords = None
asset_list_header = 0
landlord_block_reference = None
# Hyde - solar
data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Hyde/Solar"
data_filename = "Domna Property Analysis HYDE (Chichester Removed).xlsx"
data_filename = "Domna Property Analysis HYDE (Chichester Removed)V2-Completed.xlsx"
sheet_name = "Electric Property Inspections"
postcode_column = 'Postcode'
address1_column = None # Is only patchily populated so we create it
@ -88,14 +190,14 @@ def app():
master_filepaths = []
master_id_colnames = []
master_to_asset_list_filepath = None
phase = True
phase = False
ecosurv_landlords = None
asset_list_header = 0
landlord_block_reference = None
# Hyde cavity
data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Hyde/Cavity"
data_filename = "Domna Property Analysis HYDE (Chichester Removed).xlsx"
data_filename = "Domna Property Analysis HYDE (Chichester Removed)V2-Completed.xlsx"
sheet_name = "Cavity Inspections"
postcode_column = 'Postcode'
address1_column = None # Is only patchily populated so we create it
@ -122,7 +224,7 @@ def app():
master_filepaths = []
master_id_colnames = []
master_to_asset_list_filepath = None
phase = True
phase = False
ecosurv_landlords = None
asset_list_header = 0
landlord_block_reference = None

View file

@ -347,7 +347,8 @@ class SearchEpc:
# We update the data with the correct uprn
if self.uprn:
for x in api_response["response"]["rows"]:
x["uprn"] = self.uprn
if pd.isnull(x["uprn"]):
x["uprn"] = self.uprn
data["rows"].extend(api_response["response"]["rows"])
@ -357,6 +358,8 @@ class SearchEpc:
row for row in data["rows"]
if row["lmk-key"] not in seen and not seen.add(row["lmk-key"])
]
# Overwrite the data
self.data = data
if data["rows"]:
api_response["msg"] = self.SUCCESS

View file

@ -145,14 +145,17 @@ def extract_portfolio_aggregation_data(
cost = sum([r["total"] for r in default_recommendations])
sap_point_improvement = sum([r["sap_points"] for r in default_recommendations])
lower_bound_valuation_uplift = (
property_value_increase_ranges[p.id]["lower_bound_increased_value"] -
property_value_increase_ranges[p.id]["current_value"]
)
upper_bound_valuation_uplift = (
property_value_increase_ranges[p.id]["upper_bound_increased_value"] -
property_value_increase_ranges[p.id]["current_value"]
)
if not pd.isnull(property_value_increase_ranges[p.id]["current_value"]):
lower_bound_valuation_uplift = (
property_value_increase_ranges[p.id]["lower_bound_increased_value"] -
property_value_increase_ranges[p.id]["current_value"]
)
upper_bound_valuation_uplift = (
property_value_increase_ranges[p.id]["upper_bound_increased_value"] -
property_value_increase_ranges[p.id]["current_value"]
)
else:
lower_bound_valuation_uplift, upper_bound_valuation_uplift = 0, 0
agg_data.append({
"pre_retrofit_epc": p.data["current-energy-rating"],
@ -523,6 +526,7 @@ async def model_engine(body: PlanTriggerRequest):
plan_input["built_form"] = plan_input["built_form"].map(built_form_map)
plan_input = plan_input.to_dict("records")
else:
raise ValueError("Other formats not yet supported")
@ -549,6 +553,13 @@ async def model_engine(body: PlanTriggerRequest):
# If we have patches or overrides, we should read them in here
patches, already_installed, non_invasive_recommendations, valuation_data = get_request_property_data(body)
if body.file_type == "xlsx" and body.file_format == "domna_asset_list":
# We check if we have valuation data
if not valuation_data and body.valuation_file_path in [None, ""]:
# We check plan_input
if "domna_valuation" in plan_input[0]:
valuation_data = [{"uprn": x["uprn"], "valuation": x["domna_valuation"]} for x in plan_input]
cleaning_data = read_dataframe_from_s3_parquet(
bucket_name=get_settings().DATA_BUCKET, file_key="sap_change_model/cleaning_dataset.parquet",
)
@ -563,12 +574,22 @@ async def model_engine(body: PlanTriggerRequest):
if uprn:
uprn = int(float(uprn))
address1 = config.get("address", None)
# Handle domna address list format
if pd.isnull(address1) and body.file_format == "domna_asset_list":
address1 = config.get("domna_full_address", None)
address1 = str(int(address1)) if isinstance(address1, float) else str(address1)
full_address = config["domna_full_address"] if body.file_format == "domna_asset_list" else None
epc_searcher = SearchEpc(
address1=str(config["address"]),
address1=address1,
postcode=config["postcode"],
uprn=uprn,
auth_token=get_settings().EPC_AUTH_TOKEN,
os_api_key="",
full_address=full_address
)
epc_searcher.ordnance_survey_client.built_form = config.get("built_form", None)
epc_searcher.ordnance_survey_client.property_type = config.get("property_type", None)
@ -1176,9 +1197,10 @@ async def model_engine(body: PlanTriggerRequest):
upload_funding(session, p, new_plan_id, recommendations_to_upload)
property_valuation_increases.append(
valuations["average_increased_value"] - valuations["current_value"]
)
if valuations["current_value"] > 0:
property_valuation_increases.append(
valuations["average_increased_value"] - valuations["current_value"]
)
# Commit the session after each batch
session.commit()

View file

@ -219,12 +219,19 @@ class PropertyValuation:
current_epc = property_instance.data["current-energy-rating"]
if not current_value:
# In this case, we return a % improvement rather than an absolute
relative_improvement = cls.estimate_valuation_improvement(
current_value=1,
current_epc=current_epc,
target_epc=target_epc,
total_cost=1
)
return {
"current_value": 0,
"lower_bound_increased_value": 0,
"upper_bound_increased_value": 0,
"average_increased_value": 0,
"average_increase": 0
"lower_bound_increased_value": relative_improvement["lower_bound_increased_value"] - 1,
"upper_bound_increased_value": relative_improvement["upper_bound_increased_value"] - 1,
"average_increased_value": relative_improvement["average_increased_value"] - 1,
"average_increase": relative_improvement["average_increase"]
}
return cls.estimate_valuation_improvement(current_value, current_epc, target_epc, total_cost)

View file

@ -1,38 +1,111 @@
# Initial Code
from seleniumbase import SB
from bs4 import BeautifulSoup
import pandas as pd
import time
from stealth_requests import StealthSession
import random
from multiprocessing import Pool
from tqdm import tqdm
uprns = [
100071297618,
100080893397,
100060778033,
200004793081,
100071265143,
100071297618,
100080893397,
100060778033,
200004793081,
100071265143,
]
ENGINES = ["safari", "chrome"]
estimate_list = []
for uprn in uprns:
def scrape_all_estimates(session, url):
    """Fetch ``url`` and collect the sale-estimate blocks found on the page.

    Args:
        session: Object exposing ``get(url, impersonate=...)`` that returns a
            response with a ``text`` attribute (a ``StealthSession`` at the
            call site in this file).
        url: Address of the page to request.

    Returns:
        A ``(estimates, is_blocked)`` tuple. ``estimates`` is the list of
        ``div`` elements whose ``data-testid`` is ``"sale-estimate"``;
        ``is_blocked`` is ``True`` when that list is empty.
    """
    # Rotate impersonation per request. ``random.choice`` follows the length
    # of ENGINES, so adding or removing an engine needs no change here.
    resp = session.get(url, impersonate=random.choice(ENGINES))
    page_source = BeautifulSoup(resp.text, "html.parser")
    estimates = page_source.find_all("div", {"data-testid": "sale-estimate"})
    # An empty result is treated as "blocked"; a page that genuinely has no
    # estimate cannot be told apart from a block page at this point.
    is_blocked = not estimates
    return estimates, is_blocked
# Probably can change the timings here
time.sleep(5)
with SB(uc=True) as sb:
sb.uc_open_with_reconnect(
f"https://www.zoopla.co.uk/property/uprn/{uprn}/",
3,
def parallel_task(url, max_retries=None, retry_delay=1.0):
    """Scrape the low, middle and high sale estimates for a single URL.

    Intended to be mapped over a list of URLs by a worker pool, so each call
    opens its own session.

    Args:
        url: Page to scrape.
        max_retries: Maximum number of extra attempts after the first request
            comes back without an estimate. ``None`` (the default) retries
            indefinitely, which means a page that never yields an estimate
            keeps the worker busy forever.
        retry_delay: Upper bound, in seconds, of the random pause taken
            before each retry.

    Returns:
        A dict with the keys ``"URL"``, ``"Low Estimate"``,
        ``"Middle Estimate"`` and ``"High Estimate"``; the estimates are the
        raw text of the matching elements.

    Raises:
        RuntimeError: If ``max_retries`` is set and every attempt came back
            without an estimate.
    """
    # No impersonate argument here: the engine is chosen per request inside
    # scrape_all_estimates.
    with StealthSession() as session:
        estimates, is_blocked = scrape_all_estimates(session, url)
        attempts = 0
        while is_blocked:
            if max_retries is not None and attempts >= max_retries:
                raise RuntimeError(
                    f"No sale estimate found after {max_retries} retries for URL: {url}"
                )
            print(f"Blocked by Zoopla for URL: {url}")
            # Random pause so retries are not fired at a fixed cadence.
            time.sleep(random.uniform(0, retry_delay))
            estimates, is_blocked = scrape_all_estimates(session, url)
            attempts += 1
        # Only the first estimate block on the page is used.
        first_estimate = estimates[0]
        low_estimate = first_estimate.find("span", {"data-testid": "low-estimate-blurred"}).text
        middle_estimate = first_estimate.find("p", {"data-testid": "estimate-blurred"}).text
        high_estimate = first_estimate.find("span", {"data-testid": "high-estimate-blurred"}).text
        return {
            "URL": url,
            "Low Estimate": low_estimate,
            "Middle Estimate": middle_estimate,
            "High Estimate": high_estimate,
        }
def parse_price(p):
    """Convert a displayed price such as ``"£250k"`` into a number of pounds.

    The pound sign, thousands separators and surrounding whitespace are
    dropped, and a trailing ``k`` or ``m`` (any case) scales the value by one
    thousand or one million respectively.

    Args:
        p: Price text, e.g. ``"£250k"``, ``"£1.5m"`` or ``"£1,250,000"``.

    Returns:
        The price as a float.

    Raises:
        ValueError: If the remaining text is not a valid number.
    """
    multipliers = {"k": 1_000, "m": 1_000_000}
    # Commas are removed so that values like "£1,250,000" parse instead of
    # raising ValueError in float().
    cleaned = p.replace("£", "").replace(",", "").strip().lower()
    if cleaned and cleaned[-1] in multipliers:
        return float(cleaned[:-1]) * multipliers[cleaned[-1]]
    return float(cleaned)
# def parallel_task(url):
# with StealthSession(impersonate=ENGINES[random.randint(0, 1)]) as session:
# estimates, is_blocked = scrape_all_estimates(session, url)
#
# while is_blocked:
# # Will need to wait and retry if blocked by Zoopla
# print(f"Blocked by Zoopla for URL: {url}")
# sleep_factor = random.uniform(0, 1) # Random delay to avoid detection
# time.sleep(sleep_factor * 1)
# estimates, is_blocked = scrape_all_estimates(session, url)
#
# low_estimate = (
# estimates[0].find("span", {"data-testid": "low-estimate-blurred"}).text
# ) # Find all span elements with data-testid="low-estimate"
# middle_estimate = (
# estimates[0].find("p", {"data-testid": "estimate-blurred"}).text
# ) # Find all span elements with data-testid="middle-estimate"
# high_estimate = (
# estimates[0].find("span", {"data-testid": "high-estimate-blurred"}).text
# ) # Find all span elements with data-testid="high-estimate-blurred"
#
# return {
# "URL": url,
# "Low Estimate": low_estimate,
# "Middle Estimate": middle_estimate,
# "High Estimate": high_estimate,
# }
if __name__ == "__main__":
# Get a SAL
asset_list = pd.read_excel(
"/Users/khalimconn-kowlessar/Documents/hestia/Customers/NRLA/Property Box/Property Box Finance Portfolio - "
"Standardised.xlsx",
sheet_name="Standardised Asset List"
)
asset_list["epc_os_uprn"] = asset_list["epc_os_uprn"].astype(int).astype(str)
uprns = asset_list["epc_os_uprn"].tolist()
urls = [f"https://www.zoopla.co.uk/property/uprn/{uprn}/" for uprn in uprns]
with Pool(processes=5) as pool:
estimates_list = list(
tqdm(
pool.imap(parallel_task, urls),
total=len(urls),
)
)
soup = sb.get_beautiful_soup()
df = pd.DataFrame(estimates_list)
# Extract UPRN from URL
df["uprn"] = df["URL"].str.extract(r"uprn/(\d+)/")
df["valuation"] = df["Middle Estimate"].apply(parse_price)
df.to_csv("zoopla_estimates.csv", index=False)
estimates = soup.find_all("div", {"data-testid": "sale-estimate"})
# Can change the way we extract the text here
estimate_text = (
estimates[-1].find_all("p")[-1].find_all("span")[-1]["aria-label"]
)
estimate_list.append(estimate_text)
df["uprn"] = df["uprn"].astype(int).astype(str)
asset_list.merge(df[["uprn", "valuation"]], left_on="epc_os_uprn", right_on="uprn", how="left").to_excel(
"Property Box Finance Portfolio - Standardised - with valuations.xlsx", index=False
)

View file

@ -0,0 +1,5 @@
beautifulsoup4>=4.12.0
pandas>=2.0.0
stealth-requests>=1.0.7
tqdm>=4.65.0
openpyxl