creating the asset list class

2026-07-27 23:35:01 +00:00 · 2025-02-19 11:50:28 +00:00 · 2025-02-19 11:50:28 +00:00 · 8432b7d202
commit 8432b7d202
parent 55d2df1787
2 changed files with 180 additions and 50 deletions
--- a/asset_list/AssetList.py
+++ b/asset_list/AssetList.py
@ -0,0 +1,64 @@
+import os
+import pandas as pd
+
+
+class AssetList:
+    """
+    This class is used to standardise asset lists so that we can process the core information in a consistent manner.
+    """
+
+    # These are the accepted methods we have for cleaning the address1 column
+    ADDRESS_1_CLEANING_METHODS = [
+        "first_two_words",  # This method will split on the first two words, where the separator is a space
+        "first_word",  # This method will split on the first word, where the separator is a space
+        "house_number_extraction",  # This method will use the NLP model in SearchEpc to extract the house number
+        "address1_extraction"  # This method will use the NLP model to extract address1
+    ]
+
+    def __init__(
+        self,
+        local_filepath,
+        sheet_name,
+        address1_colname,
+        postcode_colname,
+        full_address_colname,
+        full_address_cols_to_concat=None,
+        missing_postcodes_method=None,
+        landlord_year_built=None,
+        landlord_uprn=None,
+        header=0
+    ):
+        self.local_filepath = local_filepath
+        self.sheet_name = sheet_name
+        self.standardised_asset_list = None
+        # Read in the data
+        self.raw_asset_list = pd.read_excel(local_filepath, header=header, sheet_name=sheet_name)
+
+        # We detect the presence of the non-intrusive columns
+        self.non_intrusives_present = True if "CIGA Check Required" in self.raw_asset_list.columns else False
+
+        # Names of columns
+        self.address1_colname = address1_colname
+        self.postcode_colname = postcode_colname
+        self.full_address_colname = full_address_colname
+        self.landlord_year_built = landlord_year_built
+        self.landlord_uprn = landlord_uprn
+
+        # parameters for cleaning
+        self.full_address_cols_to_concat = full_address_cols_to_concat
+        self.missing_postcodes_method = missing_postcodes_method
+
+    def standardise(self):
+        """
+        This function is used to standardise the asset list
+        :return: standardised asset list
+        """
+
+        # We keep just the columns we care about and will work through the various columns and standardise
+        self.standardised_asset_list = self.raw_asset_list[
+            [
+
+            ]
+        ]
+
+        raise NotImplementedError
--- a/etl/route_march_data_pull/app.py
+++ b/etl/route_march_data_pull/app.py
@ -5,6 +5,7 @@ import pandas as pd
 import numpy as np
 from tqdm import tqdm
 from datetime import datetime
+from asset_list.AssetList import AssetList

 from dotenv import load_dotenv
 from backend.SearchEpc import SearchEpc
@ -172,60 +173,107 @@ def extract_address1(asset_list, full_address_col, postcode_col, method="first_t
    raise ValueError(f"Method {method} not recognized")


-def process_age_band(x, year_built_column):
-    if isinstance(x[year_built_column], datetime):
-        year_built = x[year_built_column].year
-    else:
-        year_built = float(x[year_built_column])
+def process_age_band(asset_list, year_built_column):
+    processed_age_band = []
+    for _, x in asset_list.iterrows():

-    if pd.isnull(x["Property Age Band"]) or (
-        x["Property Age Band"] in Definitions.DATA_ANOMALY_MATCHES
-    ) or pd.isnull(year_built):
-        return "No EPC Age Band"
+        if pd.isnull(x["Property Age Band"]) or (
+            x["Property Age Band"] in Definitions.DATA_ANOMALY_MATCHES
+        ):
+            processed_age_band.append({
+                "row_id": x["row_id"],
+                "epc_year_lower_bound": None,
+                "epc_year_upper_bound": None,
+                "Does Age Match EPC Age Band?": "No EPC Age Band"
+            })
+            continue

-    # We check if we have a numeric data
-    if x["Property Age Band"].isdigit():
-        if year_built == float(x["Property Age Band"]):
-            return "EPC Age Band Matches Year Built"
-        if year_built > float(x["Property Age Band"]):
-            return "EPC Age Band is older than Year Built"
-        if year_built < float(x["Property Age Band"]):
-            return "EPC Age Band is newer than Year Built"
+        # We extract the upper and lower bounds
+        if x["Property Age Band"] in ["England and Wales: 2007 onwards", "England and Wales: 2012 onwards"]:
+            year_lower_bound = 2007 if x["Property Age Band"] == "England and Wales: 2007 onwards" else 2012

-    # Handle specific case
-    if x["Property Age Band"] == "England and Wales: 2007 onwards":
-        if year_built >= 2007:
-            return "EPC Age Band Matches Year Built"
-        if year_built < 2007:
-            return "EPC Age Band is older than Year Built"
+            if pd.isnull(x[year_built_column]):
+                age_band_matches = "No Year Built From Landlord"
+            else:
+                age_band_matches = (
+                    "EPC Age Band Matches Year Built" if x[year_built_column] >= year_lower_bound
+                    else "EPC Age Band is older than Year Built"
+                )

-    if x["Property Age Band"] == "England and Wales: 2012 onwards":
-        if year_built >= 2012:
-            return "EPC Age Band Matches Year Built"
-        if year_built < 2012:
-            return "EPC Age Band is older than Year Built"
+            processed_age_band.append(
+                {
+                    "row_id": x["row_id"],
+                    "epc_year_lower_bound": year_lower_bound,
+                    "epc_year_upper_bound": None,
+                    "Does Age Match EPC Age Band?": age_band_matches
+                }
+            )
+            continue

-    if x["Property Age Band"] == "England and Wales: before 1900":
-        if year_built < 1900:
-            return "EPC Age Band Matches Year Built"
-        if year_built >= 1900:
-            return "EPC Age Band is newer than Year Built"
+        if x["Property Age Band"] == "England and Wales: before 1900":

-    # Age band will be formatted as such:
-    # 'England and Wales: {upper date}-{lower date}'
-    # so we extract the lower and upper date
-    age_band = x["Property Age Band"].split(": ")[1]
-    lower_date, upper_date = age_band.split("-")
-    if year_built <= float(upper_date) and year_built >= float(lower_date):
-        return "EPC Age Band Matches Year Built"
+            if pd.isnull(x[year_built_column]):
+                age_band_matches = "No Year Built From Landlord"
+            else:
+                age_band_matches = (
+                    "EPC Age Band Matches Year Built" if x[year_built_column] < 1900
+                    else "EPC Age Band is newer than Year Built"
+                )

-    if year_built > float(upper_date):
-        return "EPC Age Band is older than Year Built"
+            processed_age_band.append(
+                {
+                    "row_id": x["row_id"],
+                    "epc_year_lower_bound": None,
+                    "epc_year_upper_bound": 1899,
+                    "Does Age Match EPC Age Band?": age_band_matches
+                }
+            )
+            continue

-    if year_built < float(upper_date):
-        return "EPC Age Band is newer than Year Built"
+        if x["Property Age Band"].isdigit():

-    raise Exception("Should not reach here")
+            if pd.isnull(x[year_built_column]):
+                age_band_matches = "No Year Built From Landlord"
+            else:
+                age_band_matches = (
+                    "EPC Age Band Matches Year Built" if x[year_built_column] == int(x["Property Age Band"])
+                    else "EPC Age Band is different from Year Built"
+                )
+
+            processed_age_band.append(
+                {
+                    "row_id": x["row_id"],
+                    "epc_year_lower_bound": int(x["Property Age Band"]),
+                    "epc_year_upper_bound": int(x["Property Age Band"]),
+                    "Does Age Match EPC Age Band?": age_band_matches
+                }
+            )
+            continue
+
+        # Otherwise, we extract the upper and lower bounds
+        age_band = x["Property Age Band"].split(": ")[1]
+        lower_date, upper_date = age_band.split("-")
+
+        age_band_matches = (
+            "EPC Age Band Matches Year Built" if (x[year_built_column] >= float(lower_date)) and (
+                x[year_built_column] <= float(upper_date)
+            )
+            else "EPC Age Band is older than Year Built" if x[year_built_column] > float(upper_date)
+            else "EPC Age Band is newer than Year Built"
+        )
+
+        processed_age_band.append(
+            {
+                "row_id": x["row_id"],
+                "epc_year_lower_bound": int(lower_date),
+                "epc_year_upper_bound": int(upper_date),
+                "Does Age Match EPC Age Band?": age_band_matches
+            }
+        )
+
+    processed_age_band = pd.DataFrame(processed_age_band)
+
+    return processed_age_band


 def app():
@ -282,16 +330,27 @@ def app():
    DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Westward"
    DATA_FILENAME = "WESTWARD - completed list..xlsx"
    SHEET_NAME = "Sheet1"
+
    POSTCODE_COLUMN = "WFT EDIT Postcode"
    FULLADDRESS_COLUMN = "Address"
    ADDRESS1_COLUMN = None
    ADDRESS1_METHOD = "house_number_extraction"
+
    ADDRESS_COLS_TO_CONCAT = []
    MISSING_POSTCODES_METHOD = None
    PROPERTY_YEAR_BUILT = "Build date"
    UPRN_COLUMN = "UPRN"
    # If we have the non-intrusives data, this should be true
    HAS_NON_INTRUSIVES = True
+    PROPERTY_TYPE_COLUMN = "Location type"  # This will be used to identify and remove bedsits
+
+    invalid_property_types_dictionary = ["bedsit", "bed-sit", "bed sit"]
+
+    asset_list = AssetList(
+        local_filepath=os.path.join(DATA_FOLDER, DATA_FILENAME),
+        header=0,
+        sheet_name=SHEET_NAME
+    )

    # DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Colchester"
    # DATA_FILENAME = "Warmfront data- Colchester Borough Homes (Complete).xlsx"
@ -608,8 +667,10 @@ def app():
    # 3) If we have year in the asset list, we flag entries where the built year is different from the
    # EPC Age band
    if PROPERTY_YEAR_BUILT is not None:
-        asset_list["Does Age Match EPC Age Band?"] = asset_list.apply(
-            lambda x: process_age_band(x, PROPERTY_YEAR_BUILT), axis=1
+        # We process the age band and merge it on
+        processed_age_band = process_age_band(asset_list, PROPERTY_YEAR_BUILT)
+        asset_list = asset_list.merge(
+            processed_age_band, how="left", on="row_id"
        )

    if HAS_NON_INTRUSIVES:
@ -621,7 +682,12 @@ def app():
            (asset_list["Construction"] == "CAVITY") &
            asset_list["Insulated"].isin(["EMPTY", "PARTIAL"]) &
            (
-                (asset_list[PROPERTY_YEAR_BUILT] <= 1995) # TODO, Or if the EPC age band is < 1995
+                # Should we defer to the year built provided by the HA?
+                (asset_list[PROPERTY_YEAR_BUILT] <= 1995) | (asset_list["epc_year_upper_bound"] <= 1995)
+            ) &
+            (
+                # We check if the property type column contains one of the invalid property types
+                ~asset_list[PROPERTY_TYPE_COLUMN].str.lower().str.contains("|".join(invalid_property_types_dictionary))
            )
        )

@ -633,9 +699,9 @@ def app():
                (asset_list[PROPERTY_YEAR_BUILT] <= 1995)
            ) &
            (
-                asset_list[]
+                asset_list[PROPERTY_TYPE_COLUMN]
            )
-        ]
+            ]

    # 4) Flag properties that look like they're good candidates for solar installs
    # Firstly, flag if the fabric is completely done