improving SearchEpc matching algorithm

2026-08-02 21:08:24 +00:00 · 2024-08-27 14:43:18 +01:00 · 2024-08-27 14:43:18 +01:00 · 4c71342cfb
commit 4c71342cfb
parent f122ae3269
2 changed files with 257 additions and 23 deletions
--- a/backend/SearchEpc.py
+++ b/backend/SearchEpc.py
@ -292,8 +292,7 @@ class SearchEpc:
                        "error": str(e)
                    }

-    @staticmethod
-    def filter_rows(rows, property_type=None, address=None):
+    def filter_rows(self, rows, property_type=None, address=None):
        """
        This method should not be used when property_type and address are both not None
        :param rows:
@ -321,7 +320,18 @@ class SearchEpc:

        if address is not None:
            # We can do a filter on the property type
-            best_match = process.extractOne(address, [r["address"] for r in rows], score_cutoff=0)
+            # We check if the full address contains the postcode and if it does, remove
+            if self.postcode in address:
+                address = address.replace(self.postcode, "").strip().rstrip(",")
+
+            # We check if post town is included in the address
+            if any([r["posttown"].lower() in address.lower() for r in rows]):
+                best_match = process.extractOne(
+                    address, [", ".join([r["address"], r["posttown"]]) for r in rows], score_cutoff=0
+                )
+            else:
+                best_match = process.extractOne(address, [r["address"] for r in rows], score_cutoff=0)
+            # Get all of the scores
            rows_filtered = [r for r in rows if r["address"] == best_match[0]]

            if rows_filtered:
--- a/etl/customers/orbit/archetypes.py
+++ b/etl/customers/orbit/archetypes.py
@ -1,4 +1,24 @@
 import pandas as pd
+import numpy as np
+from backend.SearchEpc import SearchEpc
+from dotenv import load_dotenv
+from tqdm import tqdm
+import os
+
+load_dotenv(dotenv_path="backend/.env")
+EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN")
+
+
+def clean_colnames(df):
+    secondary_cols = ["" if pd.isnull(x) else x for x in df.iloc[0, :].values]
+    new_colnames = [
+        "+".join([df.columns[i], secondary_cols[i]]) if secondary_cols[i] else df.columns[i]
+        for i, c in enumerate(df.columns)
+    ]
+    # Drop row 0
+    df = df.drop(0)
+    df.columns = new_colnames
+    return df


 def main():
@ -8,33 +28,237 @@ def main():
    :return:
    """

+    all_locations = [
+        "Forest Road Erith",
+        "Lesney Farms",
+        "Brook Street 155 - 243",
+        "Hazel Drive",
+        "Page Crescent",
+        "Brook Salmon Roberts and Chapma",
+        "Beacon Road"
+    ]
+
    all_assets = pd.read_excel(
        "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Orbit - Wates/Bexley Wave 3 Project - external - "
        "reduced.xlsx",
        sheet_name="Full Property List",
        header=1
    )
-
-    secondary_cols = ["" if pd.isnull(x) else x for x in all_assets.iloc[0, :].values]
-    new_colnames = [
-        "+".join([all_assets.columns[i], secondary_cols[i]]) if secondary_cols[i] else all_assets.columns[i]
-        for i, c in enumerate(all_assets.columns)
-    ]
-    # Drop row 0
+    all_assets = clean_colnames(all_assets)
+    all_assets["Location"] = None

    locations = {
-        location_name: pd.read_excel(
+        location_name: clean_colnames(pd.read_excel(
            "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Orbit - Wates/Bexley Wave 3 Project - external - "
-            "reduced"
-            ".xlsx",
-            sheet_name=location_name
-        ) for location_name in [
-            "Forest Road Erith",
-            "Lesney Farms",
-            "Brook Street 155 - 243",
-            "Hazel Drive",
-            "Page Crescent",
-            "Brook Salmon Roberts and Chapma",
-            "Beacon Road"
-        ]
+            "reduced.xlsx",
+            sheet_name=location_name,
+            header=1
+        )) for location_name in all_locations
+    }
+
+    for loc in all_locations:
+        all_assets["Location"] = np.where(
+            all_assets["Asset Reference"].isin(locations[loc]["Asset Reference"]),
+            loc,
+            all_assets["Location"]
+        )
+
+    if pd.isnull(all_assets["Location"]).sum():
+        raise Exception("something went wrong")
+
+    # 234 properties below EPC C
+    below_epc_c = all_assets[all_assets["PRE CALCULATED EPC"].isin(["D", "E", "F", "G"])].copy()
+
+    # We simplify wall type
+    below_epc_c["wall_type_simplified"] = below_epc_c["Wall Type"].str.split(" ").str[0]
+
+    known_no_epc = [
+        28679,  # These is no EPC for 11 Page Crescent, Erith, Kent, DA8 2HJ, just 11A
+        29291,  # No EPC for 225 Slade Green Road, Erith, Kent, DA8 2JW
+    ]
+    # Get the EPC data
+    epc_data = []
+    for _, home in tqdm(all_assets.iterrows(), total=len(all_assets)):
+        if home["Asset Reference"] in known_no_epc:
+            continue
+
+        address = home["Address"]
+        # Spelling error
+        if "Frinstead" in address:
+            address = address.replace("Frinstead", "Frinsted")
+
+        address1 = address.split(",")[0]
+
+        searcher = SearchEpc(
+            address1=address1,
+            postcode=home["Address - Postcode"],
+            auth_token=EPC_AUTH_TOKEN,
+            os_api_key="",
+            full_address=address,
+        )
+        searcher.ordnance_survey_client.property_type = None
+        searcher.ordnance_survey_client.built_form = None
+
+        searcher.find_property(skip_os=True)
+        if searcher.newest_epc is None:
+            raise Exception("Couldn't find")
+
+        epc_data.append(
+            {
+                "Asset Reference": home["Asset Reference"],
+                **searcher.newest_epc.copy()
+            }
+        )
+
+    epc_data = pd.DataFrame(epc_data)
+    # epc_data.to_csv(
+    #     "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Orbit - Wates/Bexley EPC data.csv", index=False
+    # )
+
+    epc_comparison = all_assets[
+        ['Asset Reference', 'Address', 'PRE CALCULATED EPC']
+    ].merge(
+        epc_data[["Asset Reference", "current-energy-rating", "lodgement-date"]],
+        on='Asset Reference',
+        how="left"
+    )
+
+    # There are a large # of properties (147) that have different pre calcualted EPC rating, to what's on the registry
+    # These may be internally held EPRs but this may inform which properties we might want to prioritise for survey
+    different_epcs = epc_comparison[
+        epc_comparison["PRE CALCULATED EPC"] != epc_comparison["current-energy-rating"]
+        ]
+
+    not_c = different_epcs[
+        (different_epcs["PRE CALCULATED EPC"] == "C") &
+        (different_epcs["current-energy-rating"] != "C")
+        ]
+
+    system_builds = below_epc_c[
+        below_epc_c["Wall Type"].str.contains("SystemBuilt")
+    ].copy()
+
+    combinations = system_builds[
+        ['Asset Type', 'Property Type', 'Location', 'PRE CALCULATED EPC', 'Wall Type', ]
+    ].drop_duplicates()
+
+    system_build_data_comparison = system_builds.merge(
+        epc_data[["Asset Reference", "walls-description", "roof-description", "current-energy-rating"]],
+        left_on='Asset Reference',
+        right_on='Asset Reference',
+        how="left"
+    )
+
+    system_build_data_comparison["PRE CALCULATED EPC"].value_counts()
+    system_build_data_comparison["current-energy-rating"].value_counts()
+
+    epc_cs_system_builds = system_build_data_comparison[system_build_data_comparison["current-energy-rating"] == "C"]
+
+    archetype_columns = [
+        ["Asset Type", "Property Type", "Wall Type", "Location"],
+        ["Asset Type", "Property Type", "Location"],
+        ["Asset Type", "Property Type", "Wall Type", "Location", "PRE CALCULATED EPC", "roof-description"],
+        ["Asset Type", "Property Type", "Location", "PRE CALCULATED EPC"]
+    ]
+
+    summary = []
+    for cols in archetype_columns:
+        combinations = system_build_data_comparison[cols].drop_duplicates()
+        summary.append(
+            {
+                "cols": cols,
+                "number_archetypes": len(combinations),
+            }
+        )
+
+    summary = pd.DataFrame(summary)
+
+    # Let's use this column combination
+    chosen_combination = [
+        "Asset Type", "Property Type", "Wall Type", "Location", "PRE CALCULATED EPC", "roof-description"
+    ]
+
+    # For this combination, let's find the properties
+    archetype_combinations = system_build_data_comparison[chosen_combination].drop_duplicates().reset_index(drop=True)
+    archetype_combinations["archetype ID"] = archetype_combinations.index
+
+    archetyped_data = system_build_data_comparison.merge(
+        archetype_combinations, how="left", on=chosen_combination
+    )
+
+    counts = archetyped_data["archetype ID"].value_counts()
+    # Archetype 0: Semi D, Uninsulated system built, Pre calculated EPC D, flat insulated roof, (Lesney-0)
+    # Archetype 1: Semi D, Externally insulated system built, Pre calculated EPC D, flat insulated roof (Lesney-1)
+    # Archetype 5: Semi D, System built with unknown insulation, Pre calculated EPC D, flat roof insulated (Lesney-2)
+    # Archetype 3: Semi D, Externally insulated system built, Pre calculated EPC D, flat roof uninsulated (assumed) (
+    # Lesney-3)
+    # 0     21
+    # 1     10
+    # 5     10
+    # 3      3
+    # 2      1
+    # 4      1
+    # 6      1
+    # 7      1
+    # 8      1
+    # 9      1
+    # 10     1
+    # 11     1
+
+    # This archetype is the same as 0, apart from the pre calculate EPC being an E. The registry says this is a D
+    # This has been added to additonal units
+    eg1 = archetyped_data[archetyped_data["archetype ID"] == 2]
+
+    # This archetype is the same as 3, apart from it having limited flat roof insulation.
+    # TODO: The insulation status of this property should be confirmed
+    eg2 = archetyped_data[archetyped_data["archetype ID"] == 4]
+    eg2["roof-description"]
+    z = epc_data[epc_data["Asset Reference"] == eg2["Asset Reference"].values[0]]
+
+    # This is the one mid-terrace - the EPC data indicates that this is Semi-detached
+    # Otherwise this is archetype 5
+    # this should be semi-detached
+    eg3 = archetyped_data[archetyped_data["archetype ID"] == 6]
+    eg3_epc_data = epc_data[epc_data["Asset Reference"] == eg3["Asset Reference"].values[0]]
+
+    # This warrants its own archetype
+    # Semi D, System built with unknown insulation, Pre calculated EPC D, flat uninsulated roof
+    eg4 = archetyped_data[archetyped_data["archetype ID"] == 7]
+
+    # This property stands out due to the mixed cavity and system built wall, but besides that it's similar to
+    # archetype 0
+    # The latest EPC agrees that this is a mixed wall type but the EPC suggests solid and cavity, with an assumed
+    # insulated cavity, as built
+    eg5 = archetyped_data[archetyped_data["archetype ID"] == 8]
+
+    # Archetypes 9, 10, 11 are all similar, Semi D, Uninsulated system built, with pitched lofts with up to 200mm
+    # insulation in the lofts
+    eg6 = archetyped_data[archetyped_data["archetype ID"] == 9]
+
+    # It's just the three units
+    # They're all labelled as
+    pitched_system_built_properties = archetyped_data[archetyped_data["archetype ID"].isin([9, 10, 11])]
+    pitched_system_built_properties["Address"]
+
+    notes = [
+        {
+            "Asset Reference": 27445,
+            "note": "Confirmed this has a pitched roof on Maps"
+        },
+        {
+            "Asset Reference": 27443,
+            "note": "Confirmed this has a pitched roof on Maps"
+        },
+        {
+            "Asset Reference": 27442,
+            "note": "Confirmed this has a pitched roof on Maps"
+        },
+        {
+            "Asset Reference": 25847,
+            "note": "This is labelled as a mid-terrace but the EPC data + Maps suggest it's a semi-detached"
+        }
+    ]
+
+    patches = {
+        25847: {"Property Type": "Semi Detached House", "archetype ID": 5},
    }