Merge pull request #636 from Hestia-Homes/eco-eligiblity-bug

Boost the model-engine queue's maximum concurrency from 10 to 12 and add a fallback for properties with no Ordnance Survey (OS) data
2026-07-27 23:35:01 +00:00 · 2026-01-06 02:10:59 +08:00 · 2026-01-06 02:10:59 +08:00 · 08342dbba1
commit 08342dbba1
parent 59c3c2c7a3 eb347a4dfe
3 changed files with 159 additions and 2 deletions
--- a/Project/e_additional_uprns.py
+++ b/Project/e_additional_uprns.py
@ -0,0 +1,145 @@
+# Match the previously missed properties to the UPRNs that were sent over by Peabody
+from tqdm import tqdm
+import pandas as pd
+import os
+from utils.s3 import read_dataframe_from_s3_parquet
+
+cleaned_uprns = pd.read_excel(
+    "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting "
+    "Project/PeabodyPropertymatched_Dec25_propref_UPRN.xlsx"
+)
+
+# Grab the problematic records
+problematic_records = pd.read_excel(
+    "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting "
+    "Project/data_validation/to_standardise_uprns.xlsx"
+)
+# Remove dupe on Org Ref
+problematic_records = problematic_records.drop_duplicates("Org Ref")
+
+df = problematic_records.merge(
+    cleaned_uprns,
+    left_on="Org Ref",
+    right_on="reference"
+)
+
+# df_had_uprn = df[~pd.isnull(df["UPRN"])]
+
+# We prepare the data for analysis
+df["landlord_property_id"] = df["Org Ref"].copy()
+df["domna_property_id"] = df["Org Ref"].copy()
+
+df = df.rename(
+    columns={
+        "Address 1": "domna_address_1",
+        "Postcode": "postcode",
+        "Type": "landlord_property_type",
+        "Attachment": "landlord_built_form",
+        "Heating": "landlord_heating_system",
+        "out_uprn": "epc_os_uprn"
+    }
+)
+
+
+def make_full_address(x):
+    """Join the non-null, non-empty address parts of row ``x`` with ", " (parts are cast to str)."""
+    parts = (x['domna_address_1'], x['Address 2'], x['Address 3'])
+    return ", ".join(str(part) for part in parts if not pd.isnull(part) and part != '')
+
+
+df["domna_full_address"] = df.apply(lambda x: make_full_address(x), axis=1)
+
+df = df[
+    [
+        "domna_address_1", "Address 2", "Address 3", "postcode", "landlord_property_type",
+        "landlord_built_form", "landlord_heating_system", "epc_os_uprn", "Total Floor Area (m2)",
+        "domna_property_id", "domna_full_address"
+    ]
+]
+
+df["landlord_built_form"] = df["landlord_built_form"].map(
+    {
+        "MidTerrace": "Mid-Terrace",
+        "EndTerrace": "End-Terrace",
+        "SemiDetached": "Semi-Detached",
+        "Detached": "Detached",
+        "EnclosedEndTerrace": "Enclosed End-Terrace",
+        "EnclosedMidTerrace": "Enclosed Mid-Terrace",
+    }
+)
+
+# We have a lot of dupes - remove them
+df["epc_os_uprn"].duplicated().sum()
+
+dupe_uprns = df[df["epc_os_uprn"].duplicated()]["epc_os_uprn"].values
+dupe_df = df[df["epc_os_uprn"].isin(dupe_uprns)]
+dupe_df = dupe_df.sort_values("epc_os_uprn", ascending=True)
+# Drop every row whose UPRN appears more than once - a shared UPRN means the match is unreliable
+df = df[~df["epc_os_uprn"].isin(dupe_uprns)]
+
+filename = (
+    "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting Project/20260105 - additional "
+    "UPRNS.xlsx"
+)
+with pd.ExcelWriter(filename) as writer:
+    df.to_excel(writer, sheet_name="Standardised Asset List", index=False)
+
+# Check these are valid
+# We check UPRN validity against our OS data
+# uprn_filenames = read_dataframe_from_s3_parquet(
+#     bucket_name="retrofit-data-dev", file_key="spatial/filename_meta.parquet"
+# )
+#
+# # We're going to:
+# # 1) Grab a filename
+# # 2) Read it in
+# # 3) Check which UPRNS from our data are in that file
+# # 4) Keep a record of which UPRNS were found where
+#
+# for uprn_file in tqdm(uprn_filenames['filenames'].values, total=len(uprn_filenames)):
+#     spatial_data = read_dataframe_from_s3_parquet(
+#         bucket_name="retrofit-data-dev", file_key=f"spatial/{uprn_file}"
+#     )
+#
+#     uprns_in_file = df[
+#         df['out_uprn'].astype('Int64').isin(spatial_data['UPRN'].astype('Int64').values)
+#     ].copy()
+#
+#     print("Found {} UPRNS in file {}".format(len(uprns_in_file), uprn_file))
+#     if len(uprns_in_file) > 0:
+#         # Store the found UPRNS in the validation cache
+#         data_to_store = uprns_in_file[["Org Ref", "UPRN"]].copy()
+#         data_to_store["Source File"] = uprn_file
+#         # Store
+#         data_to_store.to_csv(
+#             "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting "
+#             f"Project/data_validation/missing_uprn_validation_cache/{uprn_file.split('.parquet')[0]}_found_uprns.csv",
+#             index=False
+#         )
+#
+# storage_locations = ("/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting "
+#                      "Project/data_validation/missing_uprn_validation_cache")
+# # List contents
+# folder_contents = os.listdir(storage_locations)
+# # Grab files and concatenate
+# all_found_uprns = []
+# for file in folder_contents:
+#     if file.endswith("_found_uprns.csv"):
+#         df = pd.read_csv(os.path.join(storage_locations, file))
+#         all_found_uprns.append(df)
+#
+# all_found_uprns = pd.concat(all_found_uprns)
+#
+# invalid = df[
+#     ~df["Org Ref"].isin(all_found_uprns["Org Ref"].values)
+# ]
+#
+# uprn_example = 10095401237
+# eg = uprn_filenames[
+#     (uprn_filenames["upper"] >= uprn_example) & (uprn_filenames["lower"] <= uprn_example)
+#     ]
+# eg2 = read_dataframe_from_s3_parquet(
+#     bucket_name="retrofit-data-dev", file_key=f"spatial/{eg['filenames'].values[0]}"
+# )
+#
+# eg2[eg2["UPRN"] == uprn_example]
--- a/etl/spatial/OpenUprnClient.py
+++ b/etl/spatial/OpenUprnClient.py
@ -150,9 +150,21 @@ class OpenUprnClient:
            )

            spatial_df = spatial_data[spatial_data["UPRN"].isin(associated_uprn)]
+            # If a property has no row in spatial_df, the loop below falls back to the numerically closest UPRN
+
            for p in input_properties:
                if p.uprn in associated_uprn:
-                    p.set_spatial(spatial_df[spatial_df["UPRN"] == p.uprn])
+                    p_spatial_df = spatial_df[spatial_df["UPRN"] == p.uprn]
+                    if p_spatial_df.empty:
+                        # Backup method - use the row with the numerically closest UPRN as a proxy (coordinates blanked)
+                        logger.info("Ordnance survey not found - faking the cloest property for a best estimation")
+                        p_spatial_df = spatial_data.loc[
+                            (spatial_data["UPRN"] - p.uprn).abs().idxmin()
+                        ].copy()
+                        p_spatial_df["LATITUDE"], p_spatial_df["LONGITUDE"] = None, None
+                        p_spatial_df = p_spatial_df.to_frame().T
+
+                    p.set_spatial(p_spatial_df)

                if p.uprn_source == SearchEpc.UPRN_SOURCE_SIMULATED:
                    p.set_spatial(cls.empty_spatial_df())
--- a/serverless.yml
+++ b/serverless.yml
@ -66,7 +66,7 @@ functions:
      - sqs:
          arn: arn:aws:sqs:${self:provider.region}:${aws:accountId}:model-engine-queue
          batchSize: 1
-          maximumConcurrency: 10  # Heavily restricts concurrency to avoid overwhelming the ldmbda limits
+          maximumConcurrency: 12  # Heavily restricts concurrency to avoid overwhelming the Lambda limits


 resources: