AIHA data review WIP

2026-07-27 23:35:01 +00:00 · 2024-09-13 15:31:43 +01:00 · 2024-09-13 15:31:43 +01:00 · 15f55c021f
commit 15f55c021f
parent ceb34979e4
2 changed files with 454 additions and 1 deletions
--- a/backend/SearchEpc.py
+++ b/backend/SearchEpc.py
@ -7,6 +7,9 @@ import pandas as pd
 import numpy as np
 from epc_api.client import EpcClient
 from backend.OrdnanceSurvey import OrdnanceSuveyClient
+from etl.epc_clean.epc_attributes.WallAttributes import WallAttributes
+from etl.epc_clean.epc_attributes.FloorAttributes import FloorAttributes
+from etl.epc_clean.epc_attributes.RoofAttributes import RoofAttributes
 from BaseUtility import Definitions
 from utils.logger import setup_logger
 from typing import List
@ -181,6 +184,7 @@ class SearchEpc:
        self.newest_epc = None
        self.older_epcs = None
        self.full_sap_epc = None
+        self.metadata = None

        # These are the address and postcode values, which we store in the database
        self.address_clean = None
@ -306,7 +310,10 @@ class SearchEpc:
        if (property_type is None) and (address is None):
            return rows

-        if len(uprns) == 1:
+        unique_property_types = {r["property-type"] for r in rows}
+
+        # We allow for variation in property type across flats/maisonettes
+        if (len(uprns) == 1) and ((len(unique_property_types) == 1) or unique_property_types == {"Flat", "Maisonette"}):
            return rows

        if property_type is not None:
@ -784,3 +791,86 @@ class SearchEpc:
        self.address_clean = self.ordnance_survey_client.address_os
        self.postcode_clean = self.ordnance_survey_client.postcode_os
        return
+
+    def check_attribute_variations(self):
+        """
+        Check whether the wall, roof and floor types recorded for the property have varied between its EPCs.
+
+        For each description field, every EPC (the older EPCs followed by the newest EPC) is passed through
+        the matching attribute cleaner, and the resulting flags are compared across EPCs. If any flag takes
+        more than one distinct value, the type is reported as having varied.
+
+        :return: dict with the keys "has_wall_type_ever_varied", "has_roof_type_ever_varied" and
+            "has_floor_type_ever_varied", each mapped to a bool
+        """
+        attribute_map = {
+            "walls-description": {
+                "cleaner": WallAttributes,
+                "attribute": [
+                    "is_cavity_wall", "is_solid_brick", "is_system_built", "is_timber_frame",
+                    "is_granite_or_whinstone", "is_cob", "is_sandstone_or_limestone", "is_park_home"
+                ],
+                "name": "has_wall_type_ever_varied"
+            },
+            "roof-description": {
+                "cleaner": RoofAttributes,
+                "attribute": [
+                    "is_flat", "is_pitched", "is_roof_room", "is_thatched", "has_dwelling_above"
+                ],
+                "name": "has_roof_type_ever_varied"
+            },
+            "floor-description": {
+                "cleaner": FloorAttributes,
+                "attribute": [
+                    "is_to_unheated_space", "is_to_external_air", "is_suspended", "is_solid",
+                ],
+                "name": "has_floor_type_ever_varied"
+            }
+        }
+
+        attribute_variations = {}
+        for description_col, config in attribute_map.items():
+            cleaner = config["cleaner"]
+            # One row of cleaned flags per EPC: the older EPCs first, then the newest EPC
+            type_timeline = pd.DataFrame([cleaner(epc[description_col]).process() for epc in self.older_epcs] + [
+                cleaner(self.newest_epc[description_col]).process()
+            ])
+            # The type has varied if any of its flags takes more than one distinct value across the EPCs
+            attribute_variations[config["name"]] = any(
+                type_timeline[col].nunique() > 1 for col in config["attribute"]
+            )
+
+        # Return only after every description field has been checked. Returning from inside the loop
+        # would report the wall type alone
+        return attribute_variations
+
+    def identify_flat_floor(self):
+        """
+        Classify the flat described by the newest EPC as a "top", "mid" or "ground" floor flat, based on the
+        cleaned roof and floor descriptions.
+
+        :return: "top" if there is no dwelling above, "mid" if there is a dwelling above and another
+            property below, otherwise "ground"
+        """
+        newest_epc = self.newest_epc
+
+        # No dwelling above means nothing sits on top of this flat, so it is the top floor
+        roof_flags = RoofAttributes(newest_epc["roof-description"]).process()
+        if not roof_flags["has_dwelling_above"]:
+            return "top"
+
+        # There is a dwelling above. With a dwelling below as well it is a mid floor flat, otherwise it is
+        # on the ground floor
+        floor_flags = FloorAttributes(newest_epc["floor-description"]).process()
+        return "mid" if floor_flags["another_property_below"] else "ground"
+
+    def get_metadata(self):
+        """
+        Build summary metadata from the EPCs found for the property and store it on self.metadata.
+
+        The metadata holds the number of days since the newest EPC was lodged, whether the SAP score has
+        ever decreased from one EPC to the next, the floor position (flats only, otherwise None) and the
+        flags returned by check_attribute_variations.
+
+        :raises ValueError: if no newest EPC is available
+        """
+        newest_epc = self.newest_epc
+        if newest_epc is None:
+            raise ValueError("No EPC data available")
+
+        # SAP scores for the older EPCs followed by the newest EPC
+        # NOTE(review): a "downgrade" is only meaningful if older_epcs is ordered oldest first - confirm
+        sap_timeline = [int(epc["current-energy-efficiency"]) for epc in self.older_epcs]
+        sap_timeline.append(int(newest_epc["current-energy-efficiency"]))
+        # A negative step between consecutive scores means the SAP score dropped at some point
+        has_sap_ever_downgraded = any(np.diff(sap_timeline) < 0)
+
+        # Flags describing whether the recorded construction types have varied over time
+        attribute_variations = self.check_attribute_variations()
+
+        # Only flats are split into top, mid and ground floor
+        floor = self.identify_flat_floor() if newest_epc["property-type"] == "Flat" else None
+
+        days_since_last_epc = (pd.Timestamp.now() - pd.Timestamp(newest_epc["lodgement-date"])).days
+
+        metadata = {
+            "days_since_last_epc": days_since_last_epc,
+            "has_sap_ever_downgraded": has_sap_ever_downgraded,
+            "floor": floor,
+        }
+        metadata.update(attribute_variations)
+        self.metadata = metadata
--- a/etl/customers/aiha/epc_data_pull.py
+++ b/etl/customers/aiha/epc_data_pull.py
@ -0,0 +1,363 @@
+import os
+from tqdm import tqdm
+from dotenv import load_dotenv
+import pandas as pd
+from backend.SearchEpc import SearchEpc
+from etl.spatial.OpenUprnClient import OpenUprnClient
+
+# Load environment variables from the backend .env file before reading the token below
+load_dotenv(dotenv_path="backend/.env")
+# Token passed to SearchEpc as auth_token; None if the variable is not set
+EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN")
+
+# Widen the pandas console output so the printed comparison tables are not truncated
+pd.set_option('display.max_rows', 500)
+pd.set_option('display.max_columns', 500)
+pd.set_option('display.width', 1000)
+
+
+def app():
+    """
+    Retrieve EPC data for the SHDF AIHA portfolio and compare it against the asset list.
+
+    Steps:
+    1) Read the asset list spreadsheet and build a searchable address for each home
+    2) Find the newest EPC and its metadata for each home using SearchEpc
+    3) Compare asset list columns against the EPC data and print the rows that differ
+    4) Fetch spatial data for the matched UPRNs and compare it against the asset list planning constraints
+    5) Select the asset list columns used for draft archetyping
+
+    Findings are printed; nothing is returned.
+
+    :raises Exception: if a home that is not in the no-EPC list cannot be found, or if an asset list value
+        in a wall/roof/floor column is not covered by any search term
+    """
+    # The column names are on the fourth row of the sheet (header is zero-indexed)
+    data = pd.read_excel(
+        "/Users/khalimconn-kowlessar/Documents/hestia/Customers/AIHA/Khalim Review - 240902 - KSQ - AIHA - SHDF Wave "
+        "3 bid - Supplementary information.xlsx",
+        sheet_name="All units information",
+        header=3
+    )
+
+    # Remove the e.g. (example) row
+    data = data.tail(-1)
+
+    # Remove the bottom 2 rows
+    data = data.head(-2)
+    data = data.reset_index(drop=True)
+    data["row_id"] = data.index
+
+    # Asset list addresses that need rewriting before they can be found
+    amendments = {
+        "12 11-18 Schonfeld Square": "12 Schonfeld Square",
+        "35 35-37 Schonfeld Square": "35 Schonfeld Square",
+        '77 Schonfeld Square': '77 Lordship Road',
+        "83 Lordship Road (Schonfeld Square)": "83 Lordship Road",
+        "A 80 Bethune Road": "80A Bethune Road",
+        "86B Bethune Road": "Flat B, 86 Bethune Road",
+        "22 Glendale Road": "22 Glendale Avenue",
+        "121 Southbourne Road": "121 Southbourne Grove",
+    }
+
+    # Addresses that are skipped rather than searched for
+    no_epc = [
+        "80B Bethune Road",
+        "89B Manor Road",
+        "12 Monkville Avenue",
+        "9 Greenview",
+    ]
+
+    property_type_map = {
+        "House, mid-terrace": "House",
+        "House, end terrace": "House",
+        "House, semi-detached": "House",
+        "House, detached": "House",
+        "Flat": "Flat",
+    }
+
+    epc_data = []
+    epc_metadata = []
+    for _, home in tqdm(data.iterrows(), total=len(data)):
+
+        # Build address 1 based on if there is:
+        # 1) Address letter or number
+        # 2) Street address
+
+        modified = False
+        address1 = ""
+        address1_backup = ""
+
+        if home["Address letter or number"] in ["A", "B", "C"]:
+            # A flat letter: try "<number><letter> <street>" first and keep "Flat <letter> <number> <street>"
+            # as a fallback
+            house_no = home['Street address'].split(' ')[0]
+            street = ' '.join(home['Street address'].split(' ')[1:])
+            address1 = f"{house_no}{home['Address letter or number']} {street}"
+
+            address1_backup = f"Flat {home['Address letter or number']} {house_no} {street}"
+            modified = True
+
+        else:
+            if not pd.isnull(home["Address letter or number"]):
+                address1 += f"{home['Address letter or number']} "
+            if not pd.isnull(home["Street address"]):
+                address1 += f"{home['Street address']}"
+        address1 = address1.strip()
+
+        if address1.split(" ")[-1].lower() == "rd":
+            # Replace with road
+            # NOTE(review): this lower-cases the whole address, so the exact-match amendments and no_epc
+            # lookups below cannot apply to an address that ended in "rd" - confirm this is acceptable
+            address1 = address1.lower().replace(" rd", " road")
+
+        # Specific amendments
+        if address1 in amendments:
+            address1 = amendments[address1]
+
+        if address1 in no_epc:
+            continue
+
+        searcher = SearchEpc(
+            address1=address1,
+            postcode=home["Postcode"],
+            auth_token=EPC_AUTH_TOKEN,
+            os_api_key="",
+            property_type=property_type_map[home["Property type"]]
+        )
+        searcher.find_property(skip_os=True)
+
+        # Retry with the "Flat <letter> ..." form if the first form found nothing
+        if searcher.newest_epc is None and modified:
+            searcher = SearchEpc(
+                address1=address1_backup,
+                postcode=home["Postcode"],
+                auth_token=EPC_AUTH_TOKEN,
+                os_api_key="",
+                property_type=property_type_map[home["Property type"]]
+            )
+            searcher.find_property(skip_os=True)
+
+        if searcher.newest_epc is None:
+            raise Exception("Not found")
+
+        epc_data.append(
+            {
+                "row_id": home["row_id"],
+                **searcher.newest_epc
+            }
+        )
+
+        searcher.get_metadata()
+
+        epc_metadata.append(
+            {
+                "row_id": home["row_id"],
+                "address": address1,
+                "postcode": home["Postcode"],
+                **searcher.metadata
+            }
+        )
+
+    epc_metadata = pd.DataFrame(epc_metadata)
+    epc_data = pd.DataFrame(epc_data)
+
+    # Asset list address columns, merged onto each table of differences for readability
+    asset_addresses = data[["row_id", "Address letter or number", "Street address"]]
+    epc_uprns = epc_data[["row_id", "uprn"]]
+
+    # Check matched addresses
+    matched_addresses = epc_metadata[["row_id", "address", "postcode"]].copy()
+    matched_addresses = matched_addresses.merge(asset_addresses, on="row_id", how="inner")
+
+    # We look for differences between the asset list and the EPC data
+    comparison_cols = {
+        "Property type": [
+            {
+                "epc_col": "property-type",
+                "map": property_type_map
+            },
+            {
+                "epc_col": "built-form",
+                "map": {
+                    "House, mid-terrace": "Mid-Terrace",
+                    "House, end terrace": "End-Terrace",
+                    "House, semi-detached": "Semi-Detached",
+                    "House, detached": "Detached",
+                    "Flat": "Flat",
+                }
+            }
+        ],
+        "Energy starting band (EPC)": [
+            {
+                "epc_col": "current-energy-rating",
+                "map": {}
+            }
+        ],
+        "Wall type": [
+            {
+                "epc_col": "walls-description",
+                "search_terms": {
+                    "solid": "Solid brick",
+                    "cavity": "Cavity wall",
+                    "solid - internal lining": "Solid brick",
+                }
+            }
+        ],
+        "Roof type": [
+            {
+                "epc_col": "roof-description",
+                "search_terms": {
+                    "pitched": "Pitched",
+                    "n/a - (flat above)": "another dwelling above"
+                }
+            }
+        ],
+        "Floor type": [
+            {
+                "epc_col": "floor-description",
+                "search_terms": {
+                    "solid": "Solid",
+                    "suspended": "Suspended",
+                    "solid - floating floor for services": "Solid"
+                }
+            }
+        ],
+    }
+
+    # Mismatching rows keyed by EPC column. An entry is stored for every comparison, even when it is empty,
+    # so each check below always reads the comparison it names
+    differences = {}
+    for asset_list_col, list_of_configs in comparison_cols.items():
+        for config in list_of_configs:
+            epc_col = config["epc_col"]
+            remapped = data[["row_id", asset_list_col]].copy()
+
+            if "search_terms" in config:
+                # Free-text columns: the asset list value selects a search term, and the EPC description
+                # must contain the text paired with that term
+                # Strip and lower-case the asset list col in case of leading/trailing spaces or mixed case
+                remapped[asset_list_col] = remapped[asset_list_col].str.strip().str.lower()
+                remapped = remapped.merge(epc_data[["row_id", epc_col]], on="row_id", how="inner")
+
+                remapped["Match"] = None
+                for search_term, epc_term in config["search_terms"].items():
+                    # Literal matching: the terms contain "/" and brackets that are not meant as regex.
+                    # Missing asset list values select nothing and are reported by the check below
+                    has_term = remapped[asset_list_col].str.contains(search_term, regex=False, na=False)
+                    remapped.loc[has_term, "Match"] = (
+                        remapped.loc[has_term, epc_col].str.contains(epc_term, regex=False)
+                    )
+
+                # Every row must have been covered by at least one search term
+                if pd.isnull(remapped["Match"]).sum():
+                    raise Exception("Not all matched")
+
+                remapped["Match"] = remapped["Match"].astype(bool)
+            else:
+                # Categorical columns: optionally remap the asset list value, then compare for equality
+                if config["map"]:
+                    remapped[asset_list_col] = remapped[asset_list_col].map(config["map"])
+
+                # Merge on
+                remapped = remapped.merge(epc_data[["row_id", epc_col]], on="row_id", how="inner")
+                remapped["Match"] = remapped[asset_list_col] == remapped[epc_col]
+
+            differences[epc_col] = remapped[~remapped["Match"]]
+
+    # Check for property type
+    property_type_differences = differences["property-type"].merge(asset_addresses, on="row_id", how="inner")
+    print(property_type_differences)
+
+    # Check for built form. Flats are excluded from this check
+    built_form_differences = differences["built-form"]
+    built_form_differences = built_form_differences[built_form_differences["Property type"] != "Flat"]
+    built_form_differences = built_form_differences.merge(asset_addresses, on="row_id", how="inner")
+    print(built_form_differences)
+
+    # Check for energy rating
+    energy_rating_differences = differences["current-energy-rating"].merge(
+        asset_addresses, on="row_id", how="inner"
+    ).merge(
+        epc_uprns, on="row_id", how="inner"
+    )
+    print(energy_rating_differences)
+
+    # Check for wall type
+    wall_type_differences = differences["walls-description"].merge(
+        asset_addresses, on="row_id", how="inner"
+    ).merge(
+        epc_uprns, on="row_id", how="inner"
+    )
+    print(wall_type_differences)  # Many wall type differences
+
+    # Check for roof type
+    roof_type_differences = differences["roof-description"].merge(
+        asset_addresses, on="row_id", how="inner"
+    ).merge(
+        epc_uprns, on="row_id", how="inner"
+    )
+    print(roof_type_differences)  # Many roof type differences
+
+    # Check for floor type
+    floor_type_differences = differences["floor-description"].merge(
+        asset_addresses, on="row_id", how="inner"
+    ).merge(
+        epc_uprns, on="row_id", how="inner"
+    )
+    print(floor_type_differences)  # Many floor type differences
+
+    # TODO: 47 Ashtead Road [100021024699] shows solid brick wall on EPC - is probably cavity wall
+
+    # We have the EPC data. Let's check conservation area/historic/listed building status
+    portfolio_spatial_data = OpenUprnClient.get_spatial_data(
+        epc_data["uprn"].unique().tolist(), bucket_name="retrofit-data-dev"
+    )
+
+    # Cast to string so the UPRN can be joined onto the EPC uprn column
+    # NOTE(review): assumes the EPC uprn values are strings - confirm
+    portfolio_spatial_data["UPRN"] = portfolio_spatial_data["UPRN"].astype(str)
+
+    spatial_data = data[["row_id", "Planning constraints"]].merge(
+        epc_uprns, on="row_id", how="left",
+    ).merge(
+        portfolio_spatial_data[["UPRN", "conservation_status", "is_listed_building", "is_heritage_building"]],
+        left_on="uprn",
+        right_on="UPRN", how="left"
+    )
+
+    # Conservation status of the homes the asset list records as having no planning constraints
+    no_constraints = spatial_data["Planning constraints"] == "None"
+    print(spatial_data[no_constraints]["conservation_status"].value_counts())
+
+    # One property is in a conservation area, that was not picked up in the asset data
+    missed_conservation_area = spatial_data[no_constraints & (spatial_data["conservation_status"] == True)]
+    print(missed_conservation_area.merge(
+        data[["row_id", "Address letter or number", "Street address", "Postcode"]], on="row_id", how="left"
+    ))
+
+    # All properties match up apart from one where the asset data indicates it's in a conservation area, however
+    # the spatial data indicates it's not. There do not appear to be any listed/heritage buildings in the portfolio
+
+    # Draft archetyping
+    archetyping_data = data[
+        [
+            "row_id",
+            "Energy starting band (EPC)",
+            "Property type",
+            "Property year built",
+            "Gross internal area (sqm)",
+            "Current heating system type",
+            "Wall type",
+            "Floor type",
+            "Roof type",
+            "Window type",
+            "Location (Floor)",
+        ]
+    ]