From c7523f2e9076f4fb007f3e7e522091d0f6fcab2b Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 30 Jun 2025 13:35:15 +0100 Subject: [PATCH] implementing year extraction for southern --- asset_list/AssetList.py | 60 ++++++++++++-------------- asset_list/app.py | 85 +++++++++++++++++++++++++++---------- asset_list/requirements.txt | 7 ++- 3 files changed, 94 insertions(+), 58 deletions(-) diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py index 9ae05f05..91868a76 100644 --- a/asset_list/AssetList.py +++ b/asset_list/AssetList.py @@ -827,52 +827,44 @@ class AssetList: # We attempt to convert the year built to a datetime, by detecting the format and converting def extract_year(date_str): - """ - Extracts the year from a date string in the format '01-Jul-YYYY'. - Returns the extracted year as an integer or None if the format is incorrect. - """ - known_errors = [ + known_errors = { "#MULTIVALUE", + "ND", + "PIMSS EMPTY", + "UNKNOWN", "This cell has an external reference that can't be shown or edited. 
Editing this cell will " "remove the external reference.", - "ND", - 'PIMSS EMPTY', - "UNKNOWN" - ] + 0 + } - if pd.isnull(date_str) or date_str in known_errors or (date_str == 0): + if pd.isnull(date_str) or date_str in known_errors: return None - if isinstance(date_str, str): - match = re.match(r"\d{1,2}-[A-Za-z]{3}-(\d{4})", date_str) - if match: - return int(match.group(1)) # Extract the year and convert to integer - if "-" in date_str: - - # Count the number of times we have "-", as we've seen double ranges - # (when we have extensions) so the format is like this: - # 'G: 1983-1990, H: 1991-1995' - if date_str.count("-") == 2: - # We have a range - return int(date_str.split("-")[1].split(",")[0]) - # We probably have a range - return int(date_str.split("-")[1].strip()) - + # Handle datetime if isinstance(date_str, datetime): return date_str.year - if isinstance(date_str, float): - if str(int(date_str)).isdigit() & (len(str(int(date_str))) == 4): + # Handle numeric year (float or int) + if isinstance(date_str, (int, float)): + if 1000 <= int(date_str) <= 2100: return int(date_str) - # Check if date_str is a year itself - if str(date_str).isdigit() & (len(str(date_str)) == 4): - return int(date_str) + # Now handle string-based logic + if isinstance(date_str, str): + # Direct date match e.g. 
01-Jul-2021 + match = re.match(r"\d{1,2}-[A-Za-z]{3}-(\d{4})", date_str) + if match: + return int(match.group(1)) - # Remove any non-numeric characters - date_str = re.sub(r"\D", "", str(date_str)) - if str(date_str).isdigit() & (len(str(date_str)) == 4): - return int(date_str) + # Find all 4-digit years in string + years = [int(y) for y in re.findall(r"\b(?:19|20)\d{2}\b", date_str)] + if years: + return max(years) # Return most recent year + + # If only numbers are present without format + numeric_str = re.sub(r"\D", "", date_str) + if len(numeric_str) == 4 and numeric_str.isdigit(): + return int(numeric_str) raise NotImplementedError(f"Unhandled format for year built, value is {date_str} - implement me") diff --git a/asset_list/app.py b/asset_list/app.py index 7c0023ce..ab718910 100644 --- a/asset_list/app.py +++ b/asset_list/app.py @@ -1,7 +1,6 @@ import os import json import pandas as pd -from pprint import pprint from asset_list.AssetList import AssetList from asset_list.mappings.property_type import PROPERTY_MAPPING from asset_list.mappings.built_form import BUILT_FORM_MAPPINGS @@ -60,40 +59,82 @@ def app(): Property UPRN """ - # NCHA - data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/NCHA" - data_filename = "Energy Information MASTER June 2025.xlsx" - sheet_name = "Data" - postcode_column = 'Postcode' + # Southern - Optivo list + data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Southern/New Programme" + data_filename = "OPTIVO List for Warmfront.xlsx" + sheet_name = "AddressProfilingResults" + postcode_column = 'PostCode' fulladdress_column = "Address" address1_column = None address1_method = "house_number_extraction" address_cols_to_concat = [] missing_postcodes_method = None - landlord_year_built = "Build Date (HAR10)" + landlord_year_built = "Age" landlord_os_uprn = None - landlord_property_type = "Property Type (HAR10)" - landlord_built_form = "Build Form (EPC)" - landlord_wall_construction = "Wall Description" 
- landlord_roof_construction = None - landlord_heating_system = "HEAT Code" + landlord_property_type = None + landlord_built_form = None + landlord_wall_construction = "Walls" + landlord_roof_construction = "Roofs" + landlord_heating_system = "Heating" landlord_existing_pv = None - landlord_property_id = "Place ref" - landlord_sap = "EPC SAP" - outcomes_filename = None - outcomes_sheetname = None - outcomes_postcode = None - outcomes_houseno = None - outcomes_id = None - outcomes_address = None - master_filepaths = [] + landlord_property_id = "OrganisationReference" + landlord_sap = "SAP (9.92)" + outcomes_filename = [ + os.path.join(data_folder, "RT - Southern Housing Group - JJC.xlsx"), + os.path.join(data_folder, "RT - SOUTHERN OUTCOMES - SCIS Merged.xlsx"), + ] + outcomes_sheetname = ["Feedback", "Collated"] + outcomes_postcode = ["Poscode", "Postcode"] + outcomes_houseno = ["No.", "No"] + outcomes_id = ["UPRNs", None] + outcomes_address = ["Address", "Address"] + master_filepaths = [ + os.path.join(data_folder, "southern_submissions/CAVITY'S - DECEMBER 2018-Table 1.csv"), + os.path.join(data_folder, "southern_submissions/CAVITY'S 2019-Table 1.csv"), + os.path.join(data_folder, "CAVITY'S ECO4-Table 1.csv"), + os.path.join(data_folder, "LOFT'S-Table 1.csv"), + ] master_to_asset_list_filepath = None phase = False - ecosurv_landlords = None + ecosurv_landlords = "southern" asset_list_header = 0 landlord_block_reference = None master_id_colnames = [] + # NCHA + # data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/NCHA" + # data_filename = "Energy Information MASTER June 2025.xlsx" + # sheet_name = "Data" + # postcode_column = 'Postcode' + # fulladdress_column = "Address" + # address1_column = None + # address1_method = "house_number_extraction" + # address_cols_to_concat = [] + # missing_postcodes_method = None + # landlord_year_built = "Build Date (HAR10)" + # landlord_os_uprn = None + # landlord_property_type = "Property Type (HAR10)" + # landlord_built_form = "Build Form (EPC)" + # 
landlord_wall_construction = "Wall Description" + # landlord_roof_construction = None + # landlord_heating_system = "HEAT Code" + # landlord_existing_pv = None + # landlord_property_id = "Place ref" + # landlord_sap = "EPC SAP" + # outcomes_filename = None + # outcomes_sheetname = None + # outcomes_postcode = None + # outcomes_houseno = None + # outcomes_id = None + # outcomes_address = None + # master_filepaths = [] + # master_to_asset_list_filepath = None + # phase = False + # ecosurv_landlords = None + # asset_list_header = 0 + # landlord_block_reference = None + # master_id_colnames = [] + # data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Calico" # data_filename = "07.04 CALICO - Final List.xlsx" # asset_list_header = 2 diff --git a/asset_list/requirements.txt b/asset_list/requirements.txt index 99943397..b68706be 100644 --- a/asset_list/requirements.txt +++ b/asset_list/requirements.txt @@ -6,7 +6,10 @@ epc-api-python==1.0.2 thefuzz boto3 openpyxl -openai +openai>=1.3.5 tiktoken msgpack -beautifulsoup4 \ No newline at end of file +beautifulsoup4 +pydantic>=1.10.7 +typing-extensions>=4.5.0 +requests>=2.28.2