"""Match Peabody's missed properties to the UPRNs that were sent over by Peabody.

Script: etl/customers/peabody/Nov 2025 Consulting Project/e_additional_uprns.py

Steps:
    1. Load the cleaned UPRN workbook and the problematic records workbook.
    2. De-duplicate the problematic records on "Org Ref" and merge them with
       the cleaned UPRNs ("Org Ref" == "reference").
    3. Rename columns, build a full address and standardise the built form.
    4. Drop every row whose UPRN appears more than once.
    5. Write the result to an Excel workbook.
"""
import os

import pandas as pd
from tqdm import tqdm  # only used by the commented-out validation routine below

from utils.s3 import read_dataframe_from_s3_parquet  # only used by the commented-out validation routine

# Folder holding the project workbooks. The default is the original hard-coded
# location; set PEABODY_PROJECT_DIR to run the script from another machine.
PROJECT_DIR = os.environ.get(
    "PEABODY_PROJECT_DIR",
    "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting Project",
)

cleaned_uprns = pd.read_excel(
    os.path.join(PROJECT_DIR, "PeabodyPropertymatched_Dec25_propref_UPRN.xlsx")
)

# Grab the problematic records
problematic_records = pd.read_excel(
    os.path.join(PROJECT_DIR, "data_validation", "to_standardise_uprns.xlsx")
)
# Remove dupe on Org Ref
problematic_records = problematic_records.drop_duplicates("Org Ref")

# Default (inner) merge: only records with a matching "reference" are kept.
df = problematic_records.merge(
    cleaned_uprns,
    left_on="Org Ref",
    right_on="reference",
)

# df_had_uprn = df[~pd.isnull(df["UPRN"])]

# We prepare the data for analysis
# NOTE(review): landlord_property_id is not part of the column selection
# further down, so it never reaches the output workbook - confirm intended.
df["landlord_property_id"] = df["Org Ref"].copy()
df["domna_property_id"] = df["Org Ref"].copy()

df = df.rename(
    columns={
        "Address 1": "domna_address_1",
        "Postcode": "postcode",
        "Type": "landlord_property_type",
        "Attachment": "landlord_built_form",
        "Heating": "landlord_heating_system",
        "out_uprn": "epc_os_uprn",
    }
)


def make_full_address(x):
    """Join the populated address lines of a row into one string.

    Args:
        x: Row-like mapping (e.g. a pandas Series) exposing the keys
            "domna_address_1", "Address 2" and "Address 3".

    Returns:
        The non-missing, non-blank parts joined with ", ", or "" when every
        part is missing.
    """
    parts = []
    for key in ("domna_address_1", "Address 2", "Address 3"):
        value = x[key]
        if pd.isnull(value):
            continue
        # Excel cells may come back as numbers; str() keeps join() from
        # raising TypeError, strip() drops whitespace-only parts.
        text = str(value).strip()
        if text:
            parts.append(text)
    return ", ".join(parts)


df["domna_full_address"] = df.apply(make_full_address, axis=1)

# .copy() so the column assignment below writes to an independent frame
# rather than to a slice of the merged frame.
df = df[
    [
        "domna_address_1", "Address 2", "Address 3", "postcode", "landlord_property_type",
        "landlord_built_form", "landlord_heating_system", "epc_os_uprn", "Total Floor Area (m2)",
        "domna_property_id", "domna_full_address",
    ]
].copy()

# Series.map with a dict: any value not listed here becomes NaN.
df["landlord_built_form"] = df["landlord_built_form"].map(
    {
        "MidTerrace": "Mid-Terrace",
        "EndTerrace": "End-Terrace",
        "SemiDetached": "Semi-Detached",
        "Detached": "Detached",
        "EnclosedEndTerrace": "Enclosed End-Terrace",
        "EnclosedMidTerrace": "Enclosed Mid-Terrace",
    }
)

# We have a lot of dupes - remove them
print(f"Duplicated UPRN rows: {df['epc_os_uprn'].duplicated().sum()}")

dupe_uprns = df[df["epc_os_uprn"].duplicated()]["epc_os_uprn"].values
# Kept for manual inspection of the duplicated groups.
dupe_df = df[df["epc_os_uprn"].isin(dupe_uprns)]
dupe_df = dupe_df.sort_values("epc_os_uprn", ascending=True)
# Remove clear duplicate UPRNs because of unreliability
df = df[~df["epc_os_uprn"].isin(dupe_uprns)]

filename = os.path.join(PROJECT_DIR, "20260105 - additional UPRNS.xlsx")
with pd.ExcelWriter(filename) as writer:
    df.to_excel(writer, sheet_name="Standardised Asset List", index=False)

# Check these are valid
# We check UPRN validity against our OS data
# uprn_filenames = read_dataframe_from_s3_parquet(
#     bucket_name="retrofit-data-dev", file_key="spatial/filename_meta.parquet"
# )
#
# # We're going to:
# # 1) Grab a filename
# # 2) Read it in
# # 3) Check which UPRNS from our data are in that file
# # 4) Keep a record of which UPRNS were found where
#
# for uprn_file in tqdm(uprn_filenames['filenames'].values, total=len(uprn_filenames)):
#     spatial_data = read_dataframe_from_s3_parquet(
#         bucket_name="retrofit-data-dev", file_key=f"spatial/{uprn_file}"
#     )
#
#     uprns_in_file = df[
#         df['out_uprn'].astype('Int64').isin(spatial_data['UPRN'].astype('Int64').values)
#     ].copy()
#
#     print("Found {} UPRNS in file {}".format(len(uprns_in_file), uprn_file))
#     if len(uprns_in_file) > 0:
#         # Store the found UPRNS in the validation cache
#         data_to_store = uprns_in_file[["Org Ref", "UPRN"]].copy()
#         data_to_store["Source File"] = uprn_file
#         # Store
#         data_to_store.to_csv(
+# "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting " +# f"Project/data_validation/missing_uprn_validation_cache/{uprn_file.split('.parquet')[0]}_found_uprns.csv", +# index=False +# ) +# +# storage_locations = ("/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting " +# "Project/data_validation/missing_uprn_validation_cache") +# # List contents +# folder_contents = os.listdir(storage_locations) +# # Grab files and concatenate +# all_found_uprns = [] +# for file in folder_contents: +# if file.endswith("_found_uprns.csv"): +# df = pd.read_csv(os.path.join(storage_locations, file)) +# all_found_uprns.append(df) +# +# all_found_uprns = pd.concat(all_found_uprns) +# +# invalid = df[ +# ~df["Org Ref"].isin(all_found_uprns["Org Ref"].values) +# ] +# +# uprn_example = 10095401237 +# eg = uprn_filenames[ +# (uprn_filenames["upper"] >= uprn_example) & (uprn_filenames["lower"] <= uprn_example) +# ] +# eg2 = read_dataframe_from_s3_parquet( +# bucket_name="retrofit-data-dev", file_key=f"spatial/{eg['filenames'].values[0]}" +# ) +# +# eg2[eg2["UPRN"] == uprn_example] diff --git a/etl/spatial/OpenUprnClient.py b/etl/spatial/OpenUprnClient.py index 36cf2d7b..8cef80b1 100644 --- a/etl/spatial/OpenUprnClient.py +++ b/etl/spatial/OpenUprnClient.py @@ -150,9 +150,21 @@ class OpenUprnClient: ) spatial_df = spatial_data[spatial_data["UPRN"].isin(associated_uprn)] + # If this is empty, we get the nearest property + for p in input_properties: if p.uprn in associated_uprn: - p.set_spatial(spatial_df[spatial_df["UPRN"] == p.uprn]) + p_spatial_df = spatial_df[spatial_df["UPRN"] == p.uprn] + if p_spatial_df.empty: + # Backup method - take the closest UPRN as a proxy + logger.info("Ordnance survey not found - faking the cloest property for a best estimation") + p_spatial_df = spatial_data.loc[ + (spatial_data["UPRN"] - p.uprn).abs().idxmin() + ].copy() + p_spatial_df["LATITUDE"], p_spatial_df["LONGITUDE"] = None, None + 
+ p_spatial_df = p_spatial_df.to_frame().T + + p.set_spatial(p_spatial_df) if p.uprn_source == SearchEpc.UPRN_SOURCE_SIMULATED: p.set_spatial(cls.empty_spatial_df()) diff --git a/serverless.yml b/serverless.yml index 38d8da89..f3def028 100644 --- a/serverless.yml +++ b/serverless.yml @@ -66,7 +66,7 @@ functions: - sqs: arn: arn:aws:sqs:${self:provider.region}:${aws:accountId}:model-engine-queue batchSize: 1 - maximumConcurrency: 10 # Heavily restricts concurrency to avoid overwhelming the ldmbda limits + maximumConcurrency: 12 # Heavily restricts concurrency to avoid overwhelming the lambda limits resources: