mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
Merge pull request #636 from Hestia-Homes/eco-eligiblity-bug
boosting concurrency to 12 and adding catch for OS no data
This commit is contained in:
commit
08342dbba1
3 changed files with 159 additions and 2 deletions
|
|
@ -0,0 +1,145 @@
|
|||
# We look to match the missed properties to the UPRNS that were sent over by Peabody
|
||||
from tqdm import tqdm
|
||||
import pandas as pd
|
||||
import os
|
||||
from utils.s3 import read_dataframe_from_s3_parquet
|
||||
|
||||
cleaned_uprns = pd.read_excel(
|
||||
"/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting "
|
||||
"Project/PeabodyPropertymatched_Dec25_propref_UPRN.xlsx"
|
||||
)
|
||||
|
||||
# Grab the problematic records
|
||||
problematic_records = pd.read_excel(
|
||||
"/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting "
|
||||
"Project/data_validation/to_standardise_uprns.xlsx"
|
||||
)
|
||||
# Remove dupe on Org Ref
|
||||
problematic_records = problematic_records.drop_duplicates("Org Ref")
|
||||
|
||||
df = problematic_records.merge(
|
||||
cleaned_uprns,
|
||||
left_on="Org Ref",
|
||||
right_on="reference"
|
||||
)
|
||||
|
||||
# df_had_uprn = df[~pd.isnull(df["UPRN"])]
|
||||
|
||||
# We prepare the data for analysis
|
||||
df["landlord_property_id"] = df["Org Ref"].copy()
|
||||
df["domna_property_id"] = df["Org Ref"].copy()
|
||||
|
||||
df = df.rename(
|
||||
columns={
|
||||
"Address 1": "domna_address_1",
|
||||
"Postcode": "postcode",
|
||||
"Type": "landlord_property_type",
|
||||
"Attachment": "landlord_built_form",
|
||||
"Heating": "landlord_heating_system",
|
||||
"out_uprn": "epc_os_uprn"
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
def make_full_address(x):
    """Build one comma-separated address string from a row's address columns.

    Reads ``domna_address_1``, ``Address 2`` and ``Address 3`` from ``x``
    (anything indexable by those keys, e.g. a DataFrame row), drops the
    components that are null or the empty string, and joins the rest
    with ``", "`` in that order.

    Components that are not strings (for example a house number that was
    loaded from the spreadsheet as a number) are converted with ``str()``
    so that the join does not raise ``TypeError``.

    Returns:
        The joined address, or ``""`` when every component is missing.
    """
    parts = (x["domna_address_1"], x["Address 2"], x["Address 3"])
    # Use a distinct loop name so the row argument `x` is not shadowed.
    return ", ".join(
        str(part) for part in parts if not pd.isnull(part) and part != ""
    )
|
||||
|
||||
|
||||
df["domna_full_address"] = df.apply(lambda x: make_full_address(x), axis=1)
|
||||
|
||||
df = df[
|
||||
[
|
||||
"domna_address_1", "Address 2", "Address 3", "postcode", "landlord_property_type",
|
||||
"landlord_built_form", "landlord_heating_system", "epc_os_uprn", "Total Floor Area (m2)",
|
||||
"domna_property_id", "domna_full_address"
|
||||
]
|
||||
]
|
||||
|
||||
df["landlord_built_form"] = df["landlord_built_form"].map(
|
||||
{
|
||||
"MidTerrace": "Mid-Terrace",
|
||||
"EndTerrace": "End-Terrace",
|
||||
"SemiDetached": "Semi-Detached",
|
||||
"Detached": "Detached",
|
||||
"EnclosedEndTerrace": "Enclosed End-Terrace",
|
||||
"EnclosedMidTerrace": "Enclosed Mid-Terrace",
|
||||
}
|
||||
)
|
||||
|
||||
# We have a lot of dupes - remove them
|
||||
df["epc_os_uprn"].duplicated().sum()
|
||||
|
||||
dupe_uprns = df[df["epc_os_uprn"].duplicated()]["epc_os_uprn"].values
|
||||
dupe_df = df[df["epc_os_uprn"].isin(dupe_uprns)]
|
||||
dupe_df = dupe_df.sort_values("epc_os_uprn", ascending=True)
|
||||
# Remove clear duplicate UPRNs because of unreliability
|
||||
df = df[~df["epc_os_uprn"].isin(dupe_uprns)]
|
||||
|
||||
filename = (
|
||||
"/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting Project/20260105 - additional "
|
||||
"UPRNS.xlsx"
|
||||
)
|
||||
with pd.ExcelWriter(filename) as writer:
|
||||
df.to_excel(writer, sheet_name="Standardised Asset List", index=False)
|
||||
|
||||
# Check these are valid
|
||||
# We check UPRN validity against our OS data
|
||||
# uprn_filenames = read_dataframe_from_s3_parquet(
|
||||
# bucket_name="retrofit-data-dev", file_key="spatial/filename_meta.parquet"
|
||||
# )
|
||||
#
|
||||
# # We're going to:
|
||||
# # 1) Grab a filename
|
||||
# # 2) Read it in
|
||||
# # 3) Check which UPRNS from our data are in that file
|
||||
# # 4) Keep a record of which UPRNS were found where
|
||||
#
|
||||
# for uprn_file in tqdm(uprn_filenames['filenames'].values, total=len(uprn_filenames)):
|
||||
# spatial_data = read_dataframe_from_s3_parquet(
|
||||
# bucket_name="retrofit-data-dev", file_key=f"spatial/{uprn_file}"
|
||||
# )
|
||||
#
|
||||
# uprns_in_file = df[
|
||||
# df['out_uprn'].astype('Int64').isin(spatial_data['UPRN'].astype('Int64').values)
|
||||
# ].copy()
|
||||
#
|
||||
# print("Found {} UPRNS in file {}".format(len(uprns_in_file), uprn_file))
|
||||
# if len(uprns_in_file) > 0:
|
||||
# # Store the found UPRNS in the validation cache
|
||||
# data_to_store = uprns_in_file[["Org Ref", "UPRN"]].copy()
|
||||
# data_to_store["Source File"] = uprn_file
|
||||
# # Store
|
||||
# data_to_store.to_csv(
|
||||
# "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting "
|
||||
# f"Project/data_validation/missing_uprn_validation_cache/{uprn_file.split('.parquet')[0]}_found_uprns.csv",
|
||||
# index=False
|
||||
# )
|
||||
#
|
||||
# storage_locations = ("/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting "
|
||||
# "Project/data_validation/missing_uprn_validation_cache")
|
||||
# # List contents
|
||||
# folder_contents = os.listdir(storage_locations)
|
||||
# # Grab files and concatenate
|
||||
# all_found_uprns = []
|
||||
# for file in folder_contents:
|
||||
# if file.endswith("_found_uprns.csv"):
|
||||
# df = pd.read_csv(os.path.join(storage_locations, file))
|
||||
# all_found_uprns.append(df)
|
||||
#
|
||||
# all_found_uprns = pd.concat(all_found_uprns)
|
||||
#
|
||||
# invalid = df[
|
||||
# ~df["Org Ref"].isin(all_found_uprns["Org Ref"].values)
|
||||
# ]
|
||||
#
|
||||
# uprn_example = 10095401237
|
||||
# eg = uprn_filenames[
|
||||
# (uprn_filenames["upper"] >= uprn_example) & (uprn_filenames["lower"] <= uprn_example)
|
||||
# ]
|
||||
# eg2 = read_dataframe_from_s3_parquet(
|
||||
# bucket_name="retrofit-data-dev", file_key=f"spatial/{eg['filenames'].values[0]}"
|
||||
# )
|
||||
#
|
||||
# eg2[eg2["UPRN"] == uprn_example]
|
||||
|
|
@ -150,9 +150,21 @@ class OpenUprnClient:
|
|||
)
|
||||
|
||||
spatial_df = spatial_data[spatial_data["UPRN"].isin(associated_uprn)]
|
||||
# If this is empty, we get the nearest property
|
||||
|
||||
for p in input_properties:
|
||||
if p.uprn in associated_uprn:
|
||||
p.set_spatial(spatial_df[spatial_df["UPRN"] == p.uprn])
|
||||
p_spatial_df = spatial_df[spatial_df["UPRN"] == p.uprn]
|
||||
if p_spatial_df.empty:
|
||||
# Backup method - take the closest UPRN as a proxy
|
||||
logger.info("Ordnance survey not found - faking the cloest property for a best estimation")
|
||||
p_spatial_df = spatial_data.loc[
|
||||
(spatial_data["UPRN"] - p.uprn).abs().idxmin()
|
||||
].copy()
|
||||
p_spatial_df["LATITUDE"], p_spatial_df["LONGITUDE"] = None, None
|
||||
p_spatial_df = p_spatial_df.to_frame().T
|
||||
|
||||
p.set_spatial(p_spatial_df)
|
||||
|
||||
if p.uprn_source == SearchEpc.UPRN_SOURCE_SIMULATED:
|
||||
p.set_spatial(cls.empty_spatial_df())
|
||||
|
|
|
|||
|
|
@ -66,7 +66,7 @@ functions:
|
|||
- sqs:
|
||||
arn: arn:aws:sqs:${self:provider.region}:${aws:accountId}:model-engine-queue
|
||||
batchSize: 1
|
||||
maximumConcurrency: 10 # Heavily restricts concurrency to avoid overwhelming the Lambda limits
|
||||
maximumConcurrency: 12 # Heavily restricts concurrency to avoid overwhelming the Lambda limits
|
||||
|
||||
|
||||
resources:
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue