From 8432b7d202c24962bae64b04023600de13a6a03d Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 19 Feb 2025 11:50:28 +0000 Subject: [PATCH] creating the asset list class --- asset_list/AssetList.py | 64 ++++++++++++ etl/route_march_data_pull/app.py | 166 +++++++++++++++++++++---------- 2 files changed, 180 insertions(+), 50 deletions(-) create mode 100644 asset_list/AssetList.py diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py new file mode 100644 index 00000000..2a16e82f --- /dev/null +++ b/asset_list/AssetList.py @@ -0,0 +1,64 @@ +import os +import pandas as pd + + +class AssetList: + """ + This class is used to standardise asset lists so that we can process the core information in a consistent manner. + """ + + # These are the accepted methods we have for cleaning the address1 column + ADDRESS_1_CLEANING_METHODS = [ + "first_two_words", # This method will split on the first two words, where the separator is a space + "first_word", # This method will split on the first word, where the separator is a space + "house_number_extraction", # This method will use the NLP model in SearchEPC to extract the housenumber + "address1_extraction" # This method will use the NLP model to extract address1 + ] + + def __init__( + self, + local_filepath, + sheet_name, + address1_colname, + postcode_colname, + full_address_colname, + full_address_cols_to_concat=None, + missing_postcodes_method=None, + landlord_year_built=None, + landlord_uprn=None, + header=0 + ): + self.local_filepath = local_filepath + self.sheet_name = sheet_name + self.standardised_asset_list = None + # Read in the data + self.raw_asset_list = pd.read_excel(local_filepath, header=header, sheet_name=sheet_name) + + # We detect the presence of the non-intrusive columns + self.non_intrusives_present = True if "CIGA Check Required" in self.raw_asset_list.columns else False + + # Names of columns + self.address1_colname = address1_colname + self.postcode_colname = postcode_colname + 
self.full_address_colname = full_address_colname + self.landlord_year_built = landlord_year_built + self.landlord_uprn = landlord_uprn + + # parameters for cleaning + self.full_address_cols_to_concat = full_address_cols_to_concat + self.missing_postcodes_method = missing_postcodes_method + + def standardise(self): + """ + This function is used to standardise the asset list + :return: standardised asset list + """ + + # We keep just the columns we care about and will work through the various columns and standardise + self.standardised_asset_list = self.raw_asset_list[ + [ + + ] + ] + + raise NotImplementedError diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py index 57239989..06082774 100644 --- a/etl/route_march_data_pull/app.py +++ b/etl/route_march_data_pull/app.py @@ -5,6 +5,7 @@ import pandas as pd import numpy as np from tqdm import tqdm from datetime import datetime +from asset_list.AssetList import AssetList from dotenv import load_dotenv from backend.SearchEpc import SearchEpc @@ -172,60 +173,107 @@ def extract_address1(asset_list, full_address_col, postcode_col, method="first_t raise ValueError(f"Method {method} not recognized") -def process_age_band(x, year_built_column): - if isinstance(x[year_built_column], datetime): - year_built = x[year_built_column].year - else: - year_built = float(x[year_built_column]) +def process_age_band(asset_list, year_built_column): + processed_age_band = [] + for _, x in asset_list.iterrows(): - if pd.isnull(x["Property Age Band"]) or ( - x["Property Age Band"] in Definitions.DATA_ANOMALY_MATCHES - ) or pd.isnull(year_built): - return "No EPC Age Band" + if pd.isnull(x["Property Age Band"]) or ( + x["Property Age Band"] in Definitions.DATA_ANOMALY_MATCHES + ): + processed_age_band.append({ + "row_id": x["row_id"], + "epc_year_lower_bound": None, + "epc_year_upper_bound": None, + "Does Age Match EPC Age Band?": "No EPC Age Band" + }) + continue - # We check if we have a numeric data - if 
x["Property Age Band"].isdigit(): - if year_built == float(x["Property Age Band"]): - return "EPC Age Band Matches Year Built" - if year_built > float(x["Property Age Band"]): - return "EPC Age Band is older than Year Built" - if year_built < float(x["Property Age Band"]): - return "EPC Age Band is newer than Year Built" + # We extract the upper and lower bounds + if x["Property Age Band"] in ["England and Wales: 2007 onwards", "England and Wales: 2012 onwards"]: + year_lower_bound = 2007 if x["Property Age Band"] == "England and Wales: 2007 onwards" else 2012 - # Handle specific case - if x["Property Age Band"] == "England and Wales: 2007 onwards": - if year_built >= 2007: - return "EPC Age Band Matches Year Built" - if year_built < 2007: - return "EPC Age Band is older than Year Built" + if pd.isnull(x[year_built_column]): + age_band_matches = "No Year Built From Landlord" + else: + age_band_matches = ( + "EPC Age Band Matches Year Built" if x[year_built_column] >= year_lower_bound + else "EPC Age Band is older than Year Built" + ) - if x["Property Age Band"] == "England and Wales: 2012 onwards": - if year_built >= 2012: - return "EPC Age Band Matches Year Built" - if year_built < 2012: - return "EPC Age Band is older than Year Built" + processed_age_band.append( + { + "row_id": x["row_id"], + "epc_year_lower_bound": year_lower_bound, + "epc_year_upper_bound": None, + "Does Age Match EPC Age Band?": age_band_matches + } + ) + continue - if x["Property Age Band"] == "England and Wales: before 1900": - if year_built < 1900: - return "EPC Age Band Matches Year Built" - if year_built >= 1900: - return "EPC Age Band is newer than Year Built" + if x["Property Age Band"] == "England and Wales: before 1900": - # Age band will be formatted as such: - # 'England and Wales: {upper date}-{lower date}' - # so we extract the lower and upper date - age_band = x["Property Age Band"].split(": ")[1] - lower_date, upper_date = age_band.split("-") - if year_built <= 
float(upper_date) and year_built >= float(lower_date): - return "EPC Age Band Matches Year Built" + if pd.isnull(x[year_built_column]): + age_band_matches = "No Year Built From Landlord" + else: + age_band_matches = ( + "EPC Age Band Matches Year Built" if x[year_built_column] < 1900 + else "EPC Age Band is newer than Year Built" + ) - if year_built > float(upper_date): - return "EPC Age Band is older than Year Built" + processed_age_band.append( + { + "row_id": x["row_id"], + "epc_year_lower_bound": None, + "epc_year_upper_bound": 1899, + "Does Age Match EPC Age Band?": age_band_matches + } + ) + continue - if year_built < float(upper_date): - return "EPC Age Band is newer than Year Built" + if x["Property Age Band"].isdigit(): - raise Exception("Should not reach here") + if pd.isnull(x[year_built_column]): + age_band_matches = "No Year Built From Landlord" + else: + age_band_matches = ( + "EPC Age Band Matches Year Built" if x[year_built_column] == int(x["Property Age Band"]) + else "EPC Age Band is different from Year Built" + ) + + processed_age_band.append( + { + "row_id": x["row_id"], + "epc_year_lower_bound": int(x["Property Age Band"]), + "epc_year_upper_bound": int(x["Property Age Band"]), + "Does Age Match EPC Age Band?": age_band_matches + } + ) + continue + + # Otherwise, we extract the upper and lower bounds + age_band = x["Property Age Band"].split(": ")[1] + lower_date, upper_date = age_band.split("-") + + age_band_matches = ( + "EPC Age Band Matches Year Built" if (x[year_built_column] >= float(lower_date)) and ( + x[year_built_column] <= float(upper_date) + ) + else "EPC Age Band is older than Year Built" if x[year_built_column] > float(upper_date) + else "EPC Age Band is newer than Year Built" + ) + + processed_age_band.append( + { + "row_id": x["row_id"], + "epc_year_lower_bound": int(lower_date), + "epc_year_upper_bound": int(upper_date), + "Does Age Match EPC Age Band?": age_band_matches + } + ) + + processed_age_band = 
pd.DataFrame(processed_age_band) + + return processed_age_band def app(): @@ -282,16 +330,27 @@ def app(): DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Westward" DATA_FILENAME = "WESTWARD - completed list..xlsx" SHEET_NAME = "Sheet1" + POSTCODE_COLUMN = "WFT EDIT Postcode" FULLADDRESS_COLUMN = "Address" ADDRESS1_COLUMN = None ADDRESS1_METHOD = "house_number_extraction" + ADDRESS_COLS_TO_CONCAT = [] MISSING_POSTCODES_METHOD = None PROPERTY_YEAR_BUILT = "Build date" UPRN_COLUMN = "UPRN" # If we have the non-intrusives data, this should be true HAS_NON_INTRUSIVES = True + PROPERTY_TYPE_COLUMN = "Location type" # This will be used to identify and remove bedsits + + invalid_property_types_dictionary = ["bedsit", "bed-sit", "bed sit"] + + asset_list = AssetList( + local_filepath=os.path.join(DATA_FOLDER, DATA_FILENAME), + header=0, + sheet_name=SHEET_NAME + ) # DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Colchester" # DATA_FILENAME = "Warmfront data- Colchester Borough Homes (Complete).xlsx" @@ -608,8 +667,10 @@ def app(): # 3) If we have year in the asset list, we flag entries where the built year is different from the # EPC Age band if PROPERTY_YEAR_BUILT is not None: - asset_list["Does Age Match EPC Age Band?"] = asset_list.apply( - lambda x: process_age_band(x, PROPERTY_YEAR_BUILT), axis=1 + # We process the age band and merge it on + processed_age_band = process_age_band(asset_list, PROPERTY_YEAR_BUILT) + asset_list = asset_list.merge( + processed_age_band, how="left", on="row_id" ) if HAS_NON_INTRUSIVES: @@ -621,7 +682,12 @@ def app(): (asset_list["Construction"] == "CAVITY") & asset_list["Insulated"].isin(["EMPTY", "PARTIAL"]) & ( - (asset_list[PROPERTY_YEAR_BUILT] <= 1995) # TODO, Or if the EPC age band is < 1995 + # Should we defer to the year built provided by the HA? 
+ (asset_list[PROPERTY_YEAR_BUILT] <= 1995) | (asset_list["epc_year_upper_bound"] <= 1995) + ) & + ( + # We check if the property type column contains one of the invalid property types + ~asset_list[PROPERTY_TYPE_COLUMN].str.lower().str.contains("|".join(invalid_property_types_dictionary)) ) ) @@ -633,9 +699,9 @@ def app(): (asset_list[PROPERTY_YEAR_BUILT] <= 1995) ) & ( - asset_list[] + asset_list[PROPERTY_TYPE_COLUMN] ) - ] + ] # 4) Flag properties that look like they're good candidates for solar installs # Firstly, flag if the fabric is completely done