implementing year extraction for southern

This commit is contained in:
Khalim Conn-Kowlessar 2025-06-30 13:35:15 +01:00
parent 2040c2a957
commit c7523f2e90
3 changed files with 94 additions and 58 deletions

View file

@ -827,52 +827,44 @@ class AssetList:
# We attempt to convert the year built to a datetime, by detecting the format and converting
def extract_year(date_str):
    """
    Extract the build year from a raw "year built" value.

    Handled inputs:
      * null values, blank strings and known placeholders
        (e.g. "UNKNOWN", "ND", 0)                      -> None
      * datetime objects                               -> their ``.year``
      * int/float whose integer part is in 1000..2100  -> that integer
      * strings starting like '01-Jul-2021'            -> 2021
      * strings containing one or more 19xx/20xx years,
        e.g. 'G: 1983-1990, H: 1991-1995'              -> the most recent (1995)
      * strings whose digits are exactly four long     -> those digits as an int

    Returns:
        The year as an int, or None when the value represents missing data.

    Raises:
        NotImplementedError: the value matches none of the formats above, so
            a new format has to be handled explicitly rather than guessed.
    """
    known_errors = {
        "#MULTIVALUE",
        "ND",
        "PIMSS EMPTY",
        "UNKNOWN",
        "This cell has an external reference that can't be shown or edited. Editing this cell will "
        "remove the external reference.",
        0,
    }

    if pd.isnull(date_str) or date_str in known_errors:
        return None

    # Handle datetime
    if isinstance(date_str, datetime):
        return date_str.year

    # Handle numeric year (float or int)
    if isinstance(date_str, (int, float)):
        # Same acceptance window as 1000 <= int(date_str) <= 2100, but compared
        # before converting so that +/-inf falls through to NotImplementedError
        # instead of raising OverflowError inside int().
        if 1000 <= date_str < 2101:
            return int(date_str)

    # Now handle string-based logic
    if isinstance(date_str, str):
        # A blank cell carries no information; treat it like a null value.
        if not date_str.strip():
            return None

        # Direct date match e.g. 01-Jul-2021
        match = re.match(r"\d{1,2}-[A-Za-z]{3}-(\d{4})", date_str)
        if match:
            return int(match.group(1))

        # Find all standalone 4-digit years in the string; for ranges or
        # multiple ranges (extensions) the most recent year wins.
        years = [int(y) for y in re.findall(r"\b(?:19|20)\d{2}\b", date_str)]
        if years:
            return max(years)

        # Fall back to the digits alone when exactly four are present
        # (e.g. '1990s', 'Pre1900' or a pre-1900 year such as '1895').
        numeric_str = re.sub(r"\D", "", date_str)
        if len(numeric_str) == 4 and numeric_str.isdigit():
            return int(numeric_str)

    raise NotImplementedError(f"Unhandled format for year built, value is {date_str} - implement me")

View file

@ -1,7 +1,6 @@
import os
import json
import pandas as pd
from pprint import pprint
from asset_list.AssetList import AssetList
from asset_list.mappings.property_type import PROPERTY_MAPPING
from asset_list.mappings.built_form import BUILT_FORM_MAPPINGS
@ -60,40 +59,82 @@ def app():
Property UPRN
"""
# NCHA
data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/NCHA"
data_filename = "Energy Information MASTER June 2025.xlsx"
sheet_name = "Data"
postcode_column = 'Postcode'
# Southern - Optivo list
data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Southern/New Programme"
data_filename = "OPTIVO List for Warmfront.xlsx"
sheet_name = "AddressProfilingResults"
postcode_column = 'PostCode'
fulladdress_column = "Address"
address1_column = None
address1_method = "house_number_extraction"
address_cols_to_concat = []
missing_postcodes_method = None
landlord_year_built = "Build Date (HAR10)"
landlord_year_built = "Age"
landlord_os_uprn = None
landlord_property_type = "Property Type (HAR10)"
landlord_built_form = "Build Form (EPC)"
landlord_wall_construction = "Wall Description"
landlord_roof_construction = None
landlord_heating_system = "HEAT Code"
landlord_property_type = None
landlord_built_form = None
landlord_wall_construction = "Walls"
landlord_roof_construction = "Roofs"
landlord_heating_system = "Heating"
landlord_existing_pv = None
landlord_property_id = "Place ref"
landlord_sap = "EPC SAP"
outcomes_filename = None
outcomes_sheetname = None
outcomes_postcode = None
outcomes_houseno = None
outcomes_id = None
outcomes_address = None
master_filepaths = []
landlord_property_id = "OrganisationReference"
landlord_sap = "SAP (9.92)"
outcomes_filename = [
os.path.join(data_folder, "RT - Southern Housing Group - JJC.xlsx"),
os.path.join(data_folder, "RT - SOUTHERN OUTCOMES - SCIS Merged.xlsx"),
]
outcomes_sheetname = ["Feedback", "Collated"]
outcomes_postcode = ["Poscode", "Postcode"]
outcomes_houseno = ["No.", "No"]
outcomes_id = ["UPRNs", None]
outcomes_address = ["Address", "Address"]
# Each entry must be a single full path. Listing data_folder and the file name
# as separate elements would yield 8 entries, half of them the bare directory.
master_filepaths = [
os.path.join(data_folder, "southern_submissions/CAVITY'S - DECEMBER 2018-Table 1.csv"),
os.path.join(data_folder, "southern_submissions/CAVITY'S 2019-Table 1.csv"),
os.path.join(data_folder, "CAVITY'S ECO4-Table 1.csv"),
os.path.join(data_folder, "LOFT'S-Table 1.csv"),
]
master_to_asset_list_filepath = None
phase = False
ecosurv_landlords = None
ecosurv_landlords = "southern"
asset_list_header = 0
landlord_block_reference = None
master_id_colnames = []
# NCHA
# data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/NCHA"
# data_filename = "Energy Information MASTER June 2025.xlsx"
# sheet_name = "Data"
# postcode_column = 'Postcode'
# fulladdress_column = "Address"
# address1_column = None
# address1_method = "house_number_extraction"
# address_cols_to_concat = []
# missing_postcodes_method = None
# landlord_year_built = "Build Date (HAR10)"
# landlord_os_uprn = None
# landlord_property_type = "Property Type (HAR10)"
# landlord_built_form = "Build Form (EPC)"
# landlord_wall_construction = "Wall Description"
# landlord_roof_construction = None
# landlord_heating_system = "HEAT Code"
# landlord_existing_pv = None
# landlord_property_id = "Place ref"
# landlord_sap = "EPC SAP"
# outcomes_filename = None
# outcomes_sheetname = None
# outcomes_postcode = None
# outcomes_houseno = None
# outcomes_id = None
# outcomes_address = None
# master_filepaths = []
# master_to_asset_list_filepath = None
# phase = False
# ecosurv_landlords = None
# asset_list_header = 0
# landlord_block_reference = None
# master_id_colnames = []
# data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Calico"
# data_filename = "07.04 CALICO - Final List.xlsx"
# asset_list_header = 2

View file

@ -6,7 +6,10 @@ epc-api-python==1.0.2
thefuzz
boto3
openpyxl
openai
openai>=1.3.5
tiktoken
msgpack
beautifulsoup4
beautifulsoup4
pydantic>=1.10.7
typing-extensions>=4.5.0
requests>=2.28.2