mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
allow no valuation and work with relative
This commit is contained in:
parent
e5272e2e64
commit
93723697a1
8 changed files with 265 additions and 53 deletions
2
.idea/Model.iml
generated
2
.idea/Model.iml
generated
|
|
@ -7,7 +7,7 @@
|
|||
<sourceFolder url="file://$MODULE_DIR$/open_uprn" isTestSource="false" />
|
||||
<sourceFolder url="file://$MODULE_DIR$/recommendations" isTestSource="false" />
|
||||
</content>
|
||||
<orderEntry type="jdk" jdkName="AssetList" jdkType="Python SDK" />
|
||||
<orderEntry type="jdk" jdkName="Fastapi-backend" jdkType="Python SDK" />
|
||||
<orderEntry type="sourceFolder" forTests="false" />
|
||||
</component>
|
||||
</module>
|
||||
2
.idea/misc.xml
generated
2
.idea/misc.xml
generated
|
|
@ -3,7 +3,7 @@
|
|||
<component name="Black">
|
||||
<option name="sdkName" value="Python 3.10 (backend)" />
|
||||
</component>
|
||||
<component name="ProjectRootManager" version="2" project-jdk-name="AssetList" project-jdk-type="Python SDK" />
|
||||
<component name="ProjectRootManager" version="2" project-jdk-name="Fastapi-backend" project-jdk-type="Python SDK" />
|
||||
<component name="PyCharmProfessionalAdvertiser">
|
||||
<option name="shown" value="true" />
|
||||
</component>
|
||||
|
|
|
|||
|
|
@ -59,9 +59,111 @@ def app():
|
|||
Property UPRN
|
||||
"""
|
||||
|
||||
#
|
||||
data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Cambridge/"
|
||||
data_filename = "22.10_Cambridge_west addresses.xlsx"
|
||||
sheet_name = "Asset List"
|
||||
postcode_column = 'Postcode'
|
||||
address1_column = None
|
||||
address1_method = "house_number_extraction"
|
||||
fulladdress_column = "Full Address"
|
||||
address_cols_to_concat = []
|
||||
missing_postcodes_method = None
|
||||
landlord_year_built = None
|
||||
landlord_os_uprn = None
|
||||
landlord_property_type = None
|
||||
landlord_built_form = None
|
||||
landlord_wall_construction = None
|
||||
landlord_roof_construction = None
|
||||
landlord_heating_system = None
|
||||
landlord_existing_pv = None
|
||||
landlord_property_id = "id"
|
||||
landlord_sap = None
|
||||
outcomes_filename = None
|
||||
outcomes_sheetname = None
|
||||
outcomes_postcode = None
|
||||
outcomes_houseno = None
|
||||
outcomes_id = None
|
||||
outcomes_address = None
|
||||
master_filepaths = []
|
||||
master_id_colnames = []
|
||||
master_to_asset_list_filepath = None
|
||||
phase = False
|
||||
ecosurv_landlords = None
|
||||
asset_list_header = 0
|
||||
landlord_block_reference = None
|
||||
|
||||
# Property Box
|
||||
data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/NRLA/Property Box"
|
||||
data_filename = "Property Box Finance Portfolio.xlsx"
|
||||
sheet_name = "Sheet1"
|
||||
postcode_column = 'Postcode'
|
||||
address1_column = None
|
||||
address1_method = "house_number_extraction"
|
||||
fulladdress_column = "Address 1"
|
||||
address_cols_to_concat = []
|
||||
missing_postcodes_method = None
|
||||
landlord_year_built = None
|
||||
landlord_os_uprn = None
|
||||
landlord_property_type = None
|
||||
landlord_built_form = None
|
||||
landlord_wall_construction = None
|
||||
landlord_roof_construction = None
|
||||
landlord_heating_system = None
|
||||
landlord_existing_pv = None
|
||||
landlord_property_id = "row_id"
|
||||
landlord_sap = None
|
||||
outcomes_filename = None
|
||||
outcomes_sheetname = None
|
||||
outcomes_postcode = None
|
||||
outcomes_houseno = None
|
||||
outcomes_id = None
|
||||
outcomes_address = None
|
||||
master_filepaths = []
|
||||
master_id_colnames = []
|
||||
master_to_asset_list_filepath = None
|
||||
phase = False
|
||||
ecosurv_landlords = None
|
||||
asset_list_header = 0
|
||||
landlord_block_reference = "block_id"
|
||||
|
||||
# CDS - able-to-pay
|
||||
data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/CDS/Able to pay"
|
||||
data_filename = "CDS_ASSET LIST_(2314).xlsx"
|
||||
sheet_name = "Sheet1"
|
||||
postcode_column = 'Property Address - Postcode'
|
||||
address1_column = "Property Address - Line 1"
|
||||
address1_method = None
|
||||
fulladdress_column = "Property Address - Line 1"
|
||||
address_cols_to_concat = []
|
||||
missing_postcodes_method = None
|
||||
landlord_year_built = None
|
||||
landlord_os_uprn = None
|
||||
landlord_property_type = None
|
||||
landlord_built_form = None
|
||||
landlord_wall_construction = None
|
||||
landlord_roof_construction = None
|
||||
landlord_heating_system = None
|
||||
landlord_existing_pv = None
|
||||
landlord_property_id = "row_id"
|
||||
landlord_sap = None
|
||||
outcomes_filename = None
|
||||
outcomes_sheetname = None
|
||||
outcomes_postcode = None
|
||||
outcomes_houseno = None
|
||||
outcomes_id = None
|
||||
outcomes_address = None
|
||||
master_filepaths = []
|
||||
master_id_colnames = []
|
||||
master_to_asset_list_filepath = None
|
||||
phase = False
|
||||
ecosurv_landlords = None
|
||||
asset_list_header = 0
|
||||
landlord_block_reference = None
|
||||
|
||||
# Hyde - solar
|
||||
data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Hyde/Solar"
|
||||
data_filename = "Domna Property Analysis HYDE (Chichester Removed).xlsx"
|
||||
data_filename = "Domna Property Analysis HYDE (Chichester Removed)V2-Completed.xlsx"
|
||||
sheet_name = "Electric Property Inspections"
|
||||
postcode_column = 'Postcode'
|
||||
address1_column = None # Is only patchily populated so we create it
|
||||
|
|
@ -88,14 +190,14 @@ def app():
|
|||
master_filepaths = []
|
||||
master_id_colnames = []
|
||||
master_to_asset_list_filepath = None
|
||||
phase = True
|
||||
phase = False
|
||||
ecosurv_landlords = None
|
||||
asset_list_header = 0
|
||||
landlord_block_reference = None
|
||||
|
||||
# Hyde cavity
|
||||
data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Hyde/Cavity"
|
||||
data_filename = "Domna Property Analysis HYDE (Chichester Removed).xlsx"
|
||||
data_filename = "Domna Property Analysis HYDE (Chichester Removed)V2-Completed.xlsx"
|
||||
sheet_name = "Cavity Inspections"
|
||||
postcode_column = 'Postcode'
|
||||
address1_column = None # Is only patchily populated so we create it
|
||||
|
|
@ -122,7 +224,7 @@ def app():
|
|||
master_filepaths = []
|
||||
master_id_colnames = []
|
||||
master_to_asset_list_filepath = None
|
||||
phase = True
|
||||
phase = False
|
||||
ecosurv_landlords = None
|
||||
asset_list_header = 0
|
||||
landlord_block_reference = None
|
||||
|
|
|
|||
|
|
@ -347,7 +347,8 @@ class SearchEpc:
|
|||
# We update the data with the correct uprn
|
||||
if self.uprn:
|
||||
for x in api_response["response"]["rows"]:
|
||||
x["uprn"] = self.uprn
|
||||
if pd.isnull(x["uprn"]):
|
||||
x["uprn"] = self.uprn
|
||||
|
||||
data["rows"].extend(api_response["response"]["rows"])
|
||||
|
||||
|
|
@ -357,6 +358,8 @@ class SearchEpc:
|
|||
row for row in data["rows"]
|
||||
if row["lmk-key"] not in seen and not seen.add(row["lmk-key"])
|
||||
]
|
||||
# Overwrite the data
|
||||
self.data = data
|
||||
|
||||
if data["rows"]:
|
||||
api_response["msg"] = self.SUCCESS
|
||||
|
|
|
|||
|
|
@ -145,14 +145,17 @@ def extract_portfolio_aggregation_data(
|
|||
cost = sum([r["total"] for r in default_recommendations])
|
||||
sap_point_improvement = sum([r["sap_points"] for r in default_recommendations])
|
||||
|
||||
lower_bound_valuation_uplift = (
|
||||
property_value_increase_ranges[p.id]["lower_bound_increased_value"] -
|
||||
property_value_increase_ranges[p.id]["current_value"]
|
||||
)
|
||||
upper_bound_valuation_uplift = (
|
||||
property_value_increase_ranges[p.id]["upper_bound_increased_value"] -
|
||||
property_value_increase_ranges[p.id]["current_value"]
|
||||
)
|
||||
if not pd.isnull(property_value_increase_ranges[p.id]["current_value"]):
|
||||
lower_bound_valuation_uplift = (
|
||||
property_value_increase_ranges[p.id]["lower_bound_increased_value"] -
|
||||
property_value_increase_ranges[p.id]["current_value"]
|
||||
)
|
||||
upper_bound_valuation_uplift = (
|
||||
property_value_increase_ranges[p.id]["upper_bound_increased_value"] -
|
||||
property_value_increase_ranges[p.id]["current_value"]
|
||||
)
|
||||
else:
|
||||
lower_bound_valuation_uplift, upper_bound_valuation_uplift = 0, 0
|
||||
|
||||
agg_data.append({
|
||||
"pre_retrofit_epc": p.data["current-energy-rating"],
|
||||
|
|
@ -523,6 +526,7 @@ async def model_engine(body: PlanTriggerRequest):
|
|||
plan_input["built_form"] = plan_input["built_form"].map(built_form_map)
|
||||
|
||||
plan_input = plan_input.to_dict("records")
|
||||
|
||||
else:
|
||||
raise ValueError("Other formats not yet supported")
|
||||
|
||||
|
|
@ -549,6 +553,13 @@ async def model_engine(body: PlanTriggerRequest):
|
|||
# If we have patches or overrides, we should read them in here
|
||||
patches, already_installed, non_invasive_recommendations, valuation_data = get_request_property_data(body)
|
||||
|
||||
if body.file_type == "xlsx" and body.file_format == "domna_asset_list":
|
||||
# We check if we have valuation data
|
||||
if not valuation_data and body.valuation_file_path in [None, ""]:
|
||||
# We check plan_input
|
||||
if "domna_valuation" in plan_input[0]:
|
||||
valuation_data = [{"uprn": x["uprn"], "valuation": x["domna_valuation"]} for x in plan_input]
|
||||
|
||||
cleaning_data = read_dataframe_from_s3_parquet(
|
||||
bucket_name=get_settings().DATA_BUCKET, file_key="sap_change_model/cleaning_dataset.parquet",
|
||||
)
|
||||
|
|
@ -563,12 +574,22 @@ async def model_engine(body: PlanTriggerRequest):
|
|||
if uprn:
|
||||
uprn = int(float(uprn))
|
||||
|
||||
address1 = config.get("address", None)
|
||||
# Handle domna address list format
|
||||
if pd.isnull(address1) and body.file_format == "domna_asset_list":
|
||||
address1 = config.get("domna_full_address", None)
|
||||
|
||||
address1 = str(int(address1)) if isinstance(address1, float) else str(address1)
|
||||
|
||||
full_address = config["domna_full_address"] if body.file_format == "domna_asset_list" else None
|
||||
|
||||
epc_searcher = SearchEpc(
|
||||
address1=str(config["address"]),
|
||||
address1=address1,
|
||||
postcode=config["postcode"],
|
||||
uprn=uprn,
|
||||
auth_token=get_settings().EPC_AUTH_TOKEN,
|
||||
os_api_key="",
|
||||
full_address=full_address
|
||||
)
|
||||
epc_searcher.ordnance_survey_client.built_form = config.get("built_form", None)
|
||||
epc_searcher.ordnance_survey_client.property_type = config.get("property_type", None)
|
||||
|
|
@ -1176,9 +1197,10 @@ async def model_engine(body: PlanTriggerRequest):
|
|||
|
||||
upload_funding(session, p, new_plan_id, recommendations_to_upload)
|
||||
|
||||
property_valuation_increases.append(
|
||||
valuations["average_increased_value"] - valuations["current_value"]
|
||||
)
|
||||
if valuations["current_value"] > 0:
|
||||
property_valuation_increases.append(
|
||||
valuations["average_increased_value"] - valuations["current_value"]
|
||||
)
|
||||
|
||||
# Commit the session after each batch
|
||||
session.commit()
|
||||
|
|
|
|||
|
|
@ -219,12 +219,19 @@ class PropertyValuation:
|
|||
current_epc = property_instance.data["current-energy-rating"]
|
||||
|
||||
if not current_value:
|
||||
# In this case, we return a % improvement rather than an absolute
|
||||
relative_improvement = cls.estimate_valuation_improvement(
|
||||
current_value=1,
|
||||
current_epc=current_epc,
|
||||
target_epc=target_epc,
|
||||
total_cost=1
|
||||
)
|
||||
return {
|
||||
"current_value": 0,
|
||||
"lower_bound_increased_value": 0,
|
||||
"upper_bound_increased_value": 0,
|
||||
"average_increased_value": 0,
|
||||
"average_increase": 0
|
||||
"lower_bound_increased_value": relative_improvement["lower_bound_increased_value"] - 1,
|
||||
"upper_bound_increased_value": relative_improvement["upper_bound_increased_value"] - 1,
|
||||
"average_increased_value": relative_improvement["average_increased_value"] - 1,
|
||||
"average_increase": relative_improvement["average_increase"]
|
||||
}
|
||||
|
||||
return cls.estimate_valuation_improvement(current_value, current_epc, target_epc, total_cost)
|
||||
|
|
|
|||
|
|
@ -1,38 +1,111 @@
|
|||
# Initial Code
|
||||
|
||||
from seleniumbase import SB
|
||||
from bs4 import BeautifulSoup
|
||||
import pandas as pd
|
||||
import time
|
||||
from stealth_requests import StealthSession
|
||||
import random
|
||||
from multiprocessing import Pool
|
||||
from tqdm import tqdm
|
||||
|
||||
uprns = [
|
||||
100071297618,
|
||||
100080893397,
|
||||
100060778033,
|
||||
200004793081,
|
||||
100071265143,
|
||||
100071297618,
|
||||
100080893397,
|
||||
100060778033,
|
||||
200004793081,
|
||||
100071265143,
|
||||
]
|
||||
ENGINES = ["safari", "chrome"]
|
||||
|
||||
estimate_list = []
|
||||
|
||||
for uprn in uprns:
|
||||
def scrape_all_estimates(session, url):
    """Fetch a property page and pull out its sale-estimate containers.

    Args:
        session: HTTP session whose ``get`` accepts an ``impersonate``
            keyword argument.
        url: Address of the property page to download.

    Returns:
        A ``(estimates, is_blocked)`` tuple. ``estimates`` holds every
        ``<div data-testid="sale-estimate">`` found in the response, and
        ``is_blocked`` is ``True`` when none were found.
    """
    # Rotate impersonation per request. random.choice keeps working if
    # ENGINES gains or loses entries, unlike a hard-coded index range.
    resp = session.get(url, impersonate=random.choice(ENGINES))
    page_source = BeautifulSoup(resp.text, "html.parser")
    estimates = page_source.find_all("div", {"data-testid": "sale-estimate"})
    # An empty result is reported as "blocked"; a page that simply has no
    # estimate is indistinguishable from a block at this point.
    is_blocked = not estimates
    return estimates, is_blocked
|
||||
|
||||
# Probably can change the timings here
|
||||
time.sleep(5)
|
||||
with SB(uc=True) as sb:
|
||||
sb.uc_open_with_reconnect(
|
||||
f"https://www.zoopla.co.uk/property/uprn/{uprn}/",
|
||||
3,
|
||||
|
||||
def _estimate_text(container, tag, testid):
    """Return the text of the ``tag`` carrying ``data-testid``, or None if absent."""
    node = container.find(tag, {"data-testid": testid})
    return node.text if node is not None else None


def parallel_task(url, max_retries=20):
    """Scrape the low, middle and high sale estimates for one property URL.

    Args:
        url: Property page to scrape.
        max_retries: How many extra attempts to make while no estimate is
            found. Bounding this stops a page that never yields an estimate
            from hanging the worker (and the pool waiting on it) forever.

    Returns:
        A dict with the keys ``"URL"``, ``"Low Estimate"``,
        ``"Middle Estimate"`` and ``"High Estimate"``. The estimate values
        are the raw text found on the page, or ``None`` when the estimate
        (or an individual figure) could not be found, so callers must
        tolerate missing values.
    """
    # No impersonate argument here
    with StealthSession() as session:
        estimates, is_blocked = scrape_all_estimates(session, url)

        attempts = 0
        while is_blocked and attempts < max_retries:
            print(f"Blocked by Zoopla for URL: {url}")
            # Random short pause between attempts
            time.sleep(random.uniform(0, 1))
            estimates, is_blocked = scrape_all_estimates(session, url)
            attempts += 1

    if is_blocked:
        # Give up on this URL but keep the row so the rest of the batch
        # can still be processed.
        print(f"No estimate found for URL after {max_retries} retries: {url}")
        low_estimate = middle_estimate = high_estimate = None
    else:
        first = estimates[0]
        low_estimate = _estimate_text(first, "span", "low-estimate-blurred")
        middle_estimate = _estimate_text(first, "p", "estimate-blurred")
        high_estimate = _estimate_text(first, "span", "high-estimate-blurred")

    return {
        "URL": url,
        "Low Estimate": low_estimate,
        "Middle Estimate": middle_estimate,
        "High Estimate": high_estimate,
    }
|
||||
|
||||
|
||||
def parse_price(p):
    """Convert a displayed price such as ``"£350k"`` into a number of pounds.

    Args:
        p: Price text. It may be prefixed with ``£``, contain thousands
            separators, and end in ``k`` (thousands) or ``m`` (millions).
            Case and surrounding whitespace are ignored.

    Returns:
        The price as a float, or NaN when ``p`` is missing (``None``/NaN)
        or blank.

    Raises:
        ValueError: If the remaining text is not a valid number.
    """
    # Missing scrape results should become missing valuations, not crashes.
    if pd.isna(p):
        return float("nan")

    # Drop the currency symbol and thousands separators ("£1,250,000").
    text = p.replace("£", "").replace(",", "").strip().lower()
    if not text:
        return float("nan")

    if text.endswith("k"):
        return float(text[:-1]) * 1000
    if text.endswith("m"):
        return float(text[:-1]) * 1_000_000
    return float(text)
|
||||
|
||||
|
||||
# def parallel_task(url):
|
||||
# with StealthSession(impersonate=ENGINES[random.randint(0, 1)]) as session:
|
||||
# estimates, is_blocked = scrape_all_estimates(session, url)
|
||||
#
|
||||
# while is_blocked:
|
||||
# # Will need to wait and retry if blocked by Zoopla
|
||||
# print(f"Blocked by Zoopla for URL: {url}")
|
||||
# sleep_factor = random.uniform(0, 1) # Random delay to avoid detection
|
||||
# time.sleep(sleep_factor * 1)
|
||||
# estimates, is_blocked = scrape_all_estimates(session, url)
|
||||
#
|
||||
# low_estimate = (
|
||||
# estimates[0].find("span", {"data-testid": "low-estimate-blurred"}).text
|
||||
# ) # Find all span elements with data-testid="low-estimate"
|
||||
# middle_estimate = (
|
||||
# estimates[0].find("p", {"data-testid": "estimate-blurred"}).text
|
||||
# ) # Find all span elements with data-testid="middle-estimate"
|
||||
# high_estimate = (
|
||||
# estimates[0].find("span", {"data-testid": "high-estimate-blurred"}).text
|
||||
# ) # Find all span elements with data-testid="high-estimate-blurred"
|
||||
#
|
||||
# return {
|
||||
# "URL": url,
|
||||
# "Low Estimate": low_estimate,
|
||||
# "Middle Estimate": middle_estimate,
|
||||
# "High Estimate": high_estimate,
|
||||
# }
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
# Get a SAL
|
||||
asset_list = pd.read_excel(
|
||||
"/Users/khalimconn-kowlessar/Documents/hestia/Customers/NRLA/Property Box/Property Box Finance Portfolio - "
|
||||
"Standardised.xlsx",
|
||||
sheet_name="Standardised Asset List"
|
||||
)
|
||||
asset_list["epc_os_uprn"] = asset_list["epc_os_uprn"].astype(int).astype(str)
|
||||
uprns = asset_list["epc_os_uprn"].tolist()
|
||||
urls = [f"https://www.zoopla.co.uk/property/uprn/{uprn}/" for uprn in uprns]
|
||||
|
||||
with Pool(processes=5) as pool:
|
||||
estimates_list = list(
|
||||
tqdm(
|
||||
pool.imap(parallel_task, urls),
|
||||
total=len(urls),
|
||||
)
|
||||
)
|
||||
|
||||
soup = sb.get_beautiful_soup()
|
||||
df = pd.DataFrame(estimates_list)
|
||||
# Extract UPRN from URL
|
||||
df["uprn"] = df["URL"].str.extract(r"uprn/(\d+)/")
|
||||
df["valuation"] = df["Middle Estimate"].apply(parse_price)
|
||||
df.to_csv("zoopla_estimates.csv", index=False)
|
||||
|
||||
estimates = soup.find_all("div", {"data-testid": "sale-estimate"})
|
||||
# Can change the way we extract the text here
|
||||
estimate_text = (
|
||||
estimates[-1].find_all("p")[-1].find_all("span")[-1]["aria-label"]
|
||||
)
|
||||
estimate_list.append(estimate_text)
|
||||
df["uprn"] = df["uprn"].astype(int).astype(str)
|
||||
|
||||
asset_list.merge(df[["uprn", "valuation"]], left_on="epc_os_uprn", right_on="uprn", how="left").to_excel(
|
||||
"Property Box Finance Portfolio - Standardised - with valuations.xlsx", index=False
|
||||
)
|
||||
|
|
|
|||
5
etl/webscrape/requirements.txt
Normal file
5
etl/webscrape/requirements.txt
Normal file
|
|
@ -0,0 +1,5 @@
|
|||
beautifulsoup4>=4.12.0
|
||||
pandas>=2.0.0
|
||||
stealth-requests>=1.0.7
|
||||
tqdm>=4.65.0
|
||||
openpyxl
|
||||
Loading…
Add table
Reference in a new issue