Merge pull request #636 from Hestia-Homes/eco-eligiblity-bug

boosting concurrency to 12 and adding catch for OS no data
This commit is contained in:
KhalimCK 2026-01-06 02:10:59 +08:00 committed by GitHub
commit 08342dbba1
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 159 additions and 2 deletions

View file

@ -0,0 +1,145 @@
# We look to match the missed properties to the UPRNS that were sent over by Peabody
from tqdm import tqdm
import pandas as pd
import os
from utils.s3 import read_dataframe_from_s3_parquet
# Spreadsheet of property references matched to UPRNs (sent over by Peabody).
cleaned_uprns = pd.read_excel(
    "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting "
    "Project/PeabodyPropertymatched_Dec25_propref_UPRN.xlsx"
)

# The problematic records, reduced to the first row seen for each "Org Ref".
problematic_records = pd.read_excel(
    "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting "
    "Project/data_validation/to_standardise_uprns.xlsx"
).drop_duplicates(subset="Org Ref")

# Inner join: only records whose "Org Ref" has a matching "reference" survive.
df = pd.merge(
    problematic_records,
    cleaned_uprns,
    left_on="Org Ref",
    right_on="reference",
)
# df_had_uprn = df[~pd.isnull(df["UPRN"])]

# Prepare the data for analysis: both ID columns start as copies of "Org Ref".
df["landlord_property_id"] = df["Org Ref"].copy()
df["domna_property_id"] = df["Org Ref"].copy()

# Source spreadsheet headers -> the column names used from here on.
column_renames = {
    "Address 1": "domna_address_1",
    "Postcode": "postcode",
    "Type": "landlord_property_type",
    "Attachment": "landlord_built_form",
    "Heating": "landlord_heating_system",
    "out_uprn": "epc_os_uprn",
}
df = df.rename(columns=column_renames)
def make_full_address(x):
    """Build one comma-separated address string from a row's address parts.

    Reads ``domna_address_1``, ``Address 2`` and ``Address 3`` from ``x``
    (anything indexable by those keys, such as a DataFrame row), skips parts
    that are null or the empty string, and joins the rest with ``", "``.

    Non-string parts (for example a house number that was read from the
    spreadsheet as an integer) are converted with ``str`` so the join cannot
    raise ``TypeError``.

    Returns an empty string when every part is missing.
    """
    address_parts = (x["domna_address_1"], x["Address 2"], x["Address 3"])
    return ", ".join(
        str(part)
        for part in address_parts
        if not pd.isnull(part) and part != ""
    )
# Build the combined address column, one row at a time.
df["domna_full_address"] = df.apply(make_full_address, axis=1)

# Keep only the columns that go into the standardised asset list. The explicit
# copy makes df an independent frame, so the column assignment below does not
# trigger pandas' SettingWithCopyWarning.
df = df[
    [
        "domna_address_1", "Address 2", "Address 3", "postcode", "landlord_property_type",
        "landlord_built_form", "landlord_heating_system", "epc_os_uprn", "Total Floor Area (m2)",
        "domna_property_id", "domna_full_address"
    ]
].copy()

# Translate the landlord's built-form codes into hyphenated labels.
# NOTE(review): Series.map turns any value that is not a key of this dict into
# NaN - confirm no other built-form codes occur in the data.
df["landlord_built_form"] = df["landlord_built_form"].map(
    {
        "MidTerrace": "Mid-Terrace",
        "EndTerrace": "End-Terrace",
        "SemiDetached": "Semi-Detached",
        "Detached": "Detached",
        "EnclosedEndTerrace": "Enclosed End-Terrace",
        "EnclosedMidTerrace": "Enclosed Mid-Terrace",
    }
)

# We have a lot of dupes - remove them
dupe_uprns = df[df["epc_os_uprn"].duplicated()]["epc_os_uprn"].values
# Every row that shares a duplicated UPRN, sorted so the groups sit together
# (kept for inspection; it is not written out below).
dupe_df = df[df["epc_os_uprn"].isin(dupe_uprns)]
dupe_df = dupe_df.sort_values("epc_os_uprn", ascending=True)
# Remove clear duplicate UPRNs because of unreliability
df = df[~df["epc_os_uprn"].isin(dupe_uprns)]

# Write the remaining rows to a single-sheet workbook.
filename = (
    "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting Project/20260105 - additional "
    "UPRNS.xlsx"
)
with pd.ExcelWriter(filename) as writer:
    df.to_excel(writer, sheet_name="Standardised Asset List", index=False)
# Check these are valid
# We check UPRN validity against our OS data
# uprn_filenames = read_dataframe_from_s3_parquet(
# bucket_name="retrofit-data-dev", file_key="spatial/filename_meta.parquet"
# )
#
# # We're going to:
# # 1) Grab a filename
# # 2) Read it in
# # 3) Check which UPRNS from our data are in that file
# # 4) Keep a record of which UPRNS were found where
#
# for uprn_file in tqdm(uprn_filenames['filenames'].values, total=len(uprn_filenames)):
# spatial_data = read_dataframe_from_s3_parquet(
# bucket_name="retrofit-data-dev", file_key=f"spatial/{uprn_file}"
# )
#
# uprns_in_file = df[
# df['out_uprn'].astype('Int64').isin(spatial_data['UPRN'].astype('Int64').values)
# ].copy()
#
# print("Found {} UPRNS in file {}".format(len(uprns_in_file), uprn_file))
# if len(uprns_in_file) > 0:
# # Store the found UPRNS in the validation cache
# data_to_store = uprns_in_file[["Org Ref", "UPRN"]].copy()
# data_to_store["Source File"] = uprn_file
# # Store
# data_to_store.to_csv(
# "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting "
# f"Project/data_validation/missing_uprn_validation_cache/{uprn_file.split('.parquet')[0]}_found_uprns.csv",
# index=False
# )
#
# storage_locations = ("/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting "
# "Project/data_validation/missing_uprn_validation_cache")
# # List contents
# folder_contents = os.listdir(storage_locations)
# # Grab files and concatenate
# all_found_uprns = []
# for file in folder_contents:
# if file.endswith("_found_uprns.csv"):
# df = pd.read_csv(os.path.join(storage_locations, file))
# all_found_uprns.append(df)
#
# all_found_uprns = pd.concat(all_found_uprns)
#
# invalid = df[
# ~df["Org Ref"].isin(all_found_uprns["Org Ref"].values)
# ]
#
# uprn_example = 10095401237
# eg = uprn_filenames[
# (uprn_filenames["upper"] >= uprn_example) & (uprn_filenames["lower"] <= uprn_example)
# ]
# eg2 = read_dataframe_from_s3_parquet(
# bucket_name="retrofit-data-dev", file_key=f"spatial/{eg['filenames'].values[0]}"
# )
#
# eg2[eg2["UPRN"] == uprn_example]

View file

@ -150,9 +150,21 @@ class OpenUprnClient:
)
spatial_df = spatial_data[spatial_data["UPRN"].isin(associated_uprn)]
# If this is empty, we get the nearest property
for p in input_properties:
if p.uprn in associated_uprn:
p.set_spatial(spatial_df[spatial_df["UPRN"] == p.uprn])
p_spatial_df = spatial_df[spatial_df["UPRN"] == p.uprn]
if p_spatial_df.empty:
# Backup method - take the closest UPRN as a proxy
logger.info("Ordnance survey not found - faking the cloest property for a best estimation")
p_spatial_df = spatial_data.loc[
(spatial_data["UPRN"] - p.uprn).abs().idxmin()
].copy()
p_spatial_df["LATITUDE"], p_spatial_df["LONGITUDE"] = None, None
p_spatial_df = p_spatial_df.to_frame().T
p.set_spatial(p_spatial_df)
if p.uprn_source == SearchEpc.UPRN_SOURCE_SIMULATED:
p.set_spatial(cls.empty_spatial_df())

View file

@ -66,7 +66,7 @@ functions:
- sqs:
arn: arn:aws:sqs:${self:provider.region}:${aws:accountId}:model-engine-queue
batchSize: 1
maximumConcurrency: 10 # Heavily restricts concurrency to avoid overwhelming the ldmbda limits
maximumConcurrency: 12 # Heavily restricts concurrency to avoid overwhelming the Lambda limits
resources: