mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
implementing year extraction for southern
This commit is contained in:
parent
2040c2a957
commit
c7523f2e90
3 changed files with 94 additions and 58 deletions
|
|
@ -827,52 +827,44 @@ class AssetList:
|
|||
# We attempt to convert the year built to a datetime, by detecting the format and converting
|
||||
|
||||
def extract_year(date_str):
    """
    Extract a year-built value as an integer from a loosely formatted cell.

    The checks run in this order:

    1. Missing values (anything ``pd.isnull`` flags), blank strings, the number 0 and
       known placeholder strings (compared case-insensitively, ignoring surrounding
       whitespace) return ``None``.
    2. ``datetime`` instances return their ``.year``.
    3. Real numbers whose integer part lies in 1000..2100 return that integer part.
    4. Strings starting with a date such as '01-Jul-2021' return the year of that date.
    5. Strings containing one or more 19xx/20xx years (for example the double range
       'G: 1983-1990, H: 1991-1995') return the most recent of them.
    6. Strings whose digits, once everything else is removed, form exactly four
       characters return that number.

    :param date_str: the raw value; may be a string, a number, a datetime or a null.
    :return: the year as an ``int``, or ``None`` when the value is a known "no data" marker.
    :raises NotImplementedError: when the value matches none of the formats above.
    """
    # Local import so this block does not depend on a new file-level import.
    import numbers

    # Placeholders seen in the data that mean "no year available".
    known_errors = {
        text.casefold()
        for text in (
            "#MULTIVALUE",
            "ND",
            "PIMSS EMPTY",
            "UNKNOWN",
            "This cell has an external reference that can't be shown or edited. Editing this cell will "
            "remove the external reference.",
        )
    }

    if pd.isnull(date_str):
        return None

    if isinstance(date_str, str):
        cleaned = date_str.strip()
        # Blank cells and placeholders are treated as missing rather than unhandled.
        if not cleaned or cleaned.casefold() in known_errors:
            return None

    # Handle datetime
    if isinstance(date_str, datetime):
        return date_str.year

    # Handle numeric year. numbers.Real also covers numpy scalar types, which are not
    # instances of the builtin int.
    if isinstance(date_str, numbers.Real):
        if date_str == 0:
            return None
        # Same as 1000 <= int(value) <= 2100 for finite values, but an infinite
        # float falls through to the final error instead of raising OverflowError.
        if 1000 <= date_str < 2101:
            return int(date_str)

    # Now handle string-based logic
    if isinstance(date_str, str):
        # Direct date match e.g. 01-Jul-2021
        match = re.match(r"\d{1,2}-[A-Za-z]{3}-(\d{4})", cleaned)
        if match:
            return int(match.group(1))

        # Find all 4-digit years in string
        years = [int(y) for y in re.findall(r"\b(?:19|20)\d{2}\b", cleaned)]
        if years:
            return max(years)  # Return most recent year

        # If only numbers are present without format (e.g. '1850' or '1930s')
        numeric_str = re.sub(r"\D", "", cleaned)
        if len(numeric_str) == 4:
            return int(numeric_str)

    raise NotImplementedError(f"Unhandled format for year built, value is {date_str} - implement me")
|
||||
|
||||
|
|
|
|||
|
|
@ -1,7 +1,6 @@
|
|||
import os
|
||||
import json
|
||||
import pandas as pd
|
||||
from pprint import pprint
|
||||
from asset_list.AssetList import AssetList
|
||||
from asset_list.mappings.property_type import PROPERTY_MAPPING
|
||||
from asset_list.mappings.built_form import BUILT_FORM_MAPPINGS
|
||||
|
|
@ -60,40 +59,82 @@ def app():
|
|||
Property UPRN
|
||||
"""
|
||||
|
||||
# NCHA
|
||||
data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/NCHA"
|
||||
data_filename = "Energy Information MASTER June 2025.xlsx"
|
||||
sheet_name = "Data"
|
||||
postcode_column = 'Postcode'
|
||||
# Southern - Optivo list
|
||||
data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Southern/New Programme"
|
||||
data_filename = "OPTIVO List for Warmfront.xlsx"
|
||||
sheet_name = "AddressProfilingResults"
|
||||
postcode_column = 'PostCode'
|
||||
fulladdress_column = "Address"
|
||||
address1_column = None
|
||||
address1_method = "house_number_extraction"
|
||||
address_cols_to_concat = []
|
||||
missing_postcodes_method = None
|
||||
landlord_year_built = "Build Date (HAR10)"
|
||||
landlord_year_built = "Age"
|
||||
landlord_os_uprn = None
|
||||
landlord_property_type = "Property Type (HAR10)"
|
||||
landlord_built_form = "Build Form (EPC)"
|
||||
landlord_wall_construction = "Wall Description"
|
||||
landlord_roof_construction = None
|
||||
landlord_heating_system = "HEAT Code"
|
||||
landlord_property_type = None
|
||||
landlord_built_form = None
|
||||
landlord_wall_construction = "Walls"
|
||||
landlord_roof_construction = "Roofs"
|
||||
landlord_heating_system = "Heating"
|
||||
landlord_existing_pv = None
|
||||
landlord_property_id = "Place ref"
|
||||
landlord_sap = "EPC SAP"
|
||||
outcomes_filename = None
|
||||
outcomes_sheetname = None
|
||||
outcomes_postcode = None
|
||||
outcomes_houseno = None
|
||||
outcomes_id = None
|
||||
outcomes_address = None
|
||||
master_filepaths = []
|
||||
landlord_property_id = "OrganisationReference"
|
||||
landlord_sap = "SAP (9.92)"
|
||||
outcomes_filename = [
|
||||
os.path.join(data_folder, "RT - Southern Housing Group - JJC.xlsx"),
|
||||
os.path.join(data_folder, "RT - SOUTHERN OUTCOMES - SCIS Merged.xlsx"),
|
||||
]
|
||||
outcomes_sheetname = ["Feedback", "Collated"]
|
||||
outcomes_postcode = ["Poscode", "Postcode"]
|
||||
outcomes_houseno = ["No.", "No"]
|
||||
outcomes_id = ["UPRNs", None]
|
||||
outcomes_address = ["Address", "Address"]
|
||||
master_filepaths = [
|
||||
data_folder, "southern_submissions/CAVITY'S - DECEMBER 2018-Table 1.csv",
|
||||
data_folder, "southern_submissions/CAVITY'S 2019-Table 1.csv",
|
||||
data_folder, "CAVITY'S ECO4-Table 1.csv",
|
||||
data_folder, "LOFT'S-Table 1.csv",
|
||||
]
|
||||
master_to_asset_list_filepath = None
|
||||
phase = False
|
||||
ecosurv_landlords = None
|
||||
ecosurv_landlords = "southern"
|
||||
asset_list_header = 0
|
||||
landlord_block_reference = None
|
||||
master_id_colnames = []
|
||||
|
||||
# NCHA
|
||||
# data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/NCHA"
|
||||
# data_filename = "Energy Information MASTER June 2025.xlsx"
|
||||
# sheet_name = "Data"
|
||||
# postcode_column = 'Postcode'
|
||||
# fulladdress_column = "Address"
|
||||
# address1_column = None
|
||||
# address1_method = "house_number_extraction"
|
||||
# address_cols_to_concat = []
|
||||
# missing_postcodes_method = None
|
||||
# landlord_year_built = "Build Date (HAR10)"
|
||||
# landlord_os_uprn = None
|
||||
# landlord_property_type = "Property Type (HAR10)"
|
||||
# landlord_built_form = "Build Form (EPC)"
|
||||
# landlord_wall_construction = "Wall Description"
|
||||
# landlord_roof_construction = None
|
||||
# landlord_heating_system = "HEAT Code"
|
||||
# landlord_existing_pv = None
|
||||
# landlord_property_id = "Place ref"
|
||||
# landlord_sap = "EPC SAP"
|
||||
# outcomes_filename = None
|
||||
# outcomes_sheetname = None
|
||||
# outcomes_postcode = None
|
||||
# outcomes_houseno = None
|
||||
# outcomes_id = None
|
||||
# outcomes_address = None
|
||||
# master_filepaths = []
|
||||
# master_to_asset_list_filepath = None
|
||||
# phase = False
|
||||
# ecosurv_landlords = None
|
||||
# asset_list_header = 0
|
||||
# landlord_block_reference = None
|
||||
# master_id_colnames = []
|
||||
|
||||
# data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Calico"
|
||||
# data_filename = "07.04 CALICO - Final List.xlsx"
|
||||
# asset_list_header = 2
|
||||
|
|
|
|||
|
|
@ -6,7 +6,10 @@ epc-api-python==1.0.2
|
|||
thefuzz
|
||||
boto3
|
||||
openpyxl
|
||||
openai
|
||||
openai>=1.3.5
|
||||
tiktoken
|
||||
msgpack
|
||||
beautifulsoup4
|
||||
beautifulsoup4
|
||||
pydantic>=1.10.7
|
||||
typing-extensions>=4.5.0
|
||||
requests>=2.28.2
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue