# Mirror of https://github.com/Hestia-Homes/Model.git
# Synced 2026-06-08 11:17:27 +00:00
# 208 lines, 8.2 KiB, Python
import os
|
|
import usaddress
|
|
import pandas as pd
|
|
from utils.logger import setup_logger
|
|
from backend.SearchEpc import SearchEpc
|
|
|
|
# Module-level logger, created once at import time via the project's setup_logger helper.
logger = setup_logger()
|
|
|
|
|
|
class AssetList:
    """
    This class is used to standardise asset lists so that we can process the core information in a consistent
    manner.

    An instance reads one sheet of an Excel workbook into ``raw_asset_list`` and keeps a working copy in
    ``standardised_asset_list`` that ``standardise`` cleans up in place.
    """

    # These are the accepted methods we have for cleaning the address1 column
    ADDRESS_1_CLEANING_METHODS = [
        "first_two_words",  # This method will split on the first two words, where the separator is a space
        "first_word",  # This method will split on the first word, where the separator is a space
        "house_number_extraction",  # This method will use the NLP model in SearchEPC to extract the housenumber
        # "address1_extraction"  # This method will use the NLP model to extract address1
    ]

    STANDARD_PROPERTY_TYPES = [
        "house",
        "flat",
        "bungalow",
        "maisonette",
        "park home",
        "block house",
    ]

    # Standard column Names
    STANDARD_ADDRESS_1 = "domna_address_1"
    STANDARD_POSTCODE = "domna_postcode"
    STANDARD_FULL_ADDRESS = "domna_full_address"
    STANDARD_YEAR_BUILT = "domna_year_built"
    STANDARD_UPRN = "ordnance_survey_uprn"
    STANDARD_PROPERTY_TYPE = "landlord_property_type"
    STANDARD_WALL_CONSTRUCTION = "landlord_wall_construction"
    STANDARD_HEATING_SYSTEM = "landlord_heating_system"
    STANDARD_EXISTING_PV = "landlord_existing_pv"

    DOMNA_PROPERTY_ID = "domna_property_id"

    def __init__(
        self,
        local_filepath,
        sheet_name,
        address1_colname,
        postcode_colname,
        full_address_colname,
        landlord_property_id=None,
        full_address_cols_to_concat=None,
        missing_postcodes_method=None,
        address1_extraction_method=None,
        landlord_year_built=None,
        landlord_uprn=None,
        landlord_property_type=None,
        landlord_wall_construction=None,
        landlord_heating_system=None,
        landlord_existing_pv=None,
        header=0
    ):
        """
        Read the asset list from an Excel file and record which columns hold which information.

        :param local_filepath: path of the workbook, passed to ``pandas.read_excel``
        :param sheet_name: sheet to read, passed to ``pandas.read_excel``
        :param address1_colname: column holding address line 1, or None to derive it in ``standardise``
            (which then requires ``address1_extraction_method``)
        :param postcode_colname: column holding the postcode
        :param full_address_colname: column holding the full address, or None to build it in ``standardise``
            from ``full_address_cols_to_concat``
        :param landlord_property_id: optional column kept in the standardised output
        :param full_address_cols_to_concat: columns joined with ", " when there is no full address column
        :param missing_postcodes_method: stored on the instance; not consulted by the methods of this class
        :param address1_extraction_method: one of ``ADDRESS_1_CLEANING_METHODS``
        :param landlord_year_built: optional column kept in the standardised output
        :param landlord_uprn: optional column kept in the standardised output
        :param landlord_property_type: optional column kept in the standardised output
        :param landlord_wall_construction: stored on the instance; not consulted by the methods of this class
        :param landlord_heating_system: stored on the instance; not consulted by the methods of this class
        :param landlord_existing_pv: stored on the instance; not consulted by the methods of this class
        :param header: header row index, passed to ``pandas.read_excel``
        """
        self.local_filepath = local_filepath
        self.sheet_name = sheet_name

        # Read in the data; the raw frame is kept untouched and all cleaning happens on the copy
        self.raw_asset_list = pd.read_excel(local_filepath, header=header, sheet_name=sheet_name)
        self.standardised_asset_list = self.raw_asset_list.copy()

        # We detect the presence of the non-intrusive columns
        self.non_intrusives_present = "CIGA Check Required" in self.raw_asset_list.columns

        # Names of columns
        self.landlord_property_id = landlord_property_id
        self.address1_colname = address1_colname
        self.postcode_colname = postcode_colname
        self.full_address_colname = full_address_colname
        self.landlord_year_built = landlord_year_built
        self.landlord_uprn = landlord_uprn
        self.landlord_property_type = landlord_property_type
        self.landlord_wall_construction = landlord_wall_construction
        self.landlord_heating_system = landlord_heating_system
        self.landlord_existing_pv = landlord_existing_pv

        # parameters for cleaning
        self.full_address_cols_to_concat = full_address_cols_to_concat
        self.missing_postcodes_method = missing_postcodes_method
        self.address1_extraction_method = address1_extraction_method

        # Placeholders, all initialised to None
        self.debug_information = {
            "property_type": None,
            "wall_construction": None,
            "heating_system": None,
            "existing_pv": None
        }

    def _extract_address1(self, asset_list, full_address_col, postcode_col, method="first_two_words"):
        """
        Derive the address1 column (named by ``self.address1_colname``) from the full address.

        :param asset_list: DataFrame to add the column to; it is modified in place and also returned
        :param full_address_col: name of the column holding the full address strings
        :param postcode_col: name of the postcode column (only read by "house_number_extraction")
        :param method: one of ``ADDRESS_1_CLEANING_METHODS``
        :return: ``asset_list`` with the address1 column written
        :raises ValueError: if ``method`` is not an accepted method
        """
        if method not in self.ADDRESS_1_CLEANING_METHODS:
            raise ValueError(f"Method {method} for producing address1 not recognized")

        if method == "first_two_words":
            words = asset_list[full_address_col].str.split(" ")
            asset_list[self.address1_colname] = words.str[:2].str.join(" ")
        elif method == "first_word":
            asset_list[self.address1_colname] = asset_list[full_address_col].str.split(" ").str[0]
        elif method == "house_number_extraction":
            # Built as a list (assigned positionally) rather than a row-wise apply, so an empty frame still
            # yields an empty column instead of an empty DataFrame
            asset_list[self.address1_colname] = [
                SearchEpc.get_house_number(address=address, postcode=postcode)
                for address, postcode in zip(asset_list[full_address_col], asset_list[postcode_col])
            ]
        else:
            # Safety net for a method that is listed as accepted but has no implementation here
            raise ValueError(f"Method {method} not recognized")

        return asset_list

    @staticmethod
    def _address1_extraction(x):
        """
        Placeholder with no implementation; it currently does nothing and returns None.
        """
        pass

    def create_property_id(self):
        """
        This function creates the domna property ID, which is simply a hash of the full address and postcode
        We want all figures to be positive

        The ID is the first 8 bytes of the SHA-256 digest of "<full address><postcode>" read as an unsigned
        integer. The built-in ``hash`` is deliberately not used: for strings it is salted per interpreter
        process, so the same property would receive a different ID on every run.
        :return: None, the column is written onto ``self.standardised_asset_list``
        """
        import hashlib

        def _stable_id(text):
            # str() guards against non-string values (e.g. NaN) when this is called on uncleaned data
            digest = hashlib.sha256(str(text).encode("utf-8")).digest()
            return int.from_bytes(digest[:8], "big")

        combined = (
            self.standardised_asset_list[self.full_address_colname]
            + self.standardised_asset_list[self.postcode_colname]
        )
        self.standardised_asset_list[self.DOMNA_PROPERTY_ID] = combined.apply(_stable_id)

    @staticmethod
    def _strip_postcode_from_full_address(full_address, postcode):
        """
        Remove every occurrence of ``postcode`` from ``full_address`` and tidy the ends of the result.

        :param full_address: full address string
        :param postcode: postcode string to remove (exact, case-sensitive match)
        :return: the address without the postcode, with surrounding commas and whitespace removed
        """
        cleaned = full_address.replace(postcode, "")
        # Remove any trailing commas and spaces
        cleaned = cleaned.rstrip(", ").strip(",").strip()
        return cleaned

    def standardise(self):
        """
        This function is used to standardise the asset list

        Steps performed on ``self.standardised_asset_list``:
          1. drop rows without a postcode
          2. build the full address from ``full_address_cols_to_concat`` if no full address column was given
          3. replace non-breaking spaces and collapse repeated spaces in the address / postcode columns
          4. derive address1 if no address1 column was given
          5. strip the postcode out of a landlord-supplied full address
          6. create the domna property ID and keep only the core columns

        Note that the later steps index by ``postcode_colname`` unconditionally, so a postcode column is
        effectively required. Standardisation of the remaining columns is not implemented yet, so the method
        currently always finishes by raising ``NotImplementedError``; the work done up to that point is left on
        ``self.standardised_asset_list``.
        :return: standardised asset list
        :raises ValueError: if address1 or the full address is missing and no way to derive it was configured
        :raises NotImplementedError: always, once the steps above have completed
        """
        derive_address1 = self.address1_colname is None
        # The postcode is only stripped from a full address the landlord supplied, not from one we build
        full_address_supplied = self.full_address_colname is not None

        # Validate the configuration up front, before touching the data
        if derive_address1 and self.address1_extraction_method is None:
            raise ValueError("Missing address 1 - please specify an extraction method")
        if not full_address_supplied and not self.full_address_cols_to_concat:
            raise ValueError("Missing full address - please specify columns to concatenate")

        # Remove rows without a postcode
        if self.postcode_colname is not None:
            self.standardised_asset_list = self.standardised_asset_list.dropna(subset=[self.postcode_colname])

        # The full address has to exist before address1 can be extracted from it, so build it first
        if not full_address_supplied:
            self.full_address_colname = self.STANDARD_FULL_ADDRESS
            address_parts = self.standardised_asset_list[list(self.full_address_cols_to_concat)]
            # Skip missing parts and stringify the rest so numeric / empty cells do not break the join
            self.standardised_asset_list[self.full_address_colname] = [
                ", ".join(str(part) for part in parts if pd.notna(part))
                for parts in address_parts.itertuples(index=False, name=None)
            ]

        # We clean up potential non-breaking spaces, and double spaces
        for col in (self.postcode_colname, self.full_address_colname, self.address1_colname):
            if col is None:
                continue
            cleaned = self.standardised_asset_list[col].astype(str)
            cleaned = cleaned.str.replace("\xa0", " ", regex=False)
            cleaned = cleaned.str.replace(r" {2,}", " ", regex=True)
            self.standardised_asset_list[col] = cleaned

        if derive_address1:
            self.address1_colname = self.STANDARD_ADDRESS_1
            # If we do not have this, we produce it
            self.standardised_asset_list = self._extract_address1(
                asset_list=self.standardised_asset_list,
                full_address_col=self.full_address_colname,
                postcode_col=self.postcode_colname,
                method=self.address1_extraction_method
            )

        if full_address_supplied:
            # Make sure to strip the postcode out of the full address
            self.standardised_asset_list[self.full_address_colname] = [
                self._strip_postcode_from_full_address(full_address=full_address, postcode=postcode)
                for full_address, postcode in zip(
                    self.standardised_asset_list[self.full_address_colname],
                    self.standardised_asset_list[self.postcode_colname],
                )
            ]

        # We create the domna property id
        self.create_property_id()

        # We keep just the columns we care about and will work through the various columns and standardise.
        # Optional columns that were not configured are None and must be left out of the selection.
        candidate_columns = [
            self.landlord_property_id,
            self.DOMNA_PROPERTY_ID,
            self.address1_colname,
            self.postcode_colname,
            self.full_address_colname,
            self.landlord_year_built,
            self.landlord_uprn,
            self.landlord_property_type,
        ]
        self.standardised_asset_list = self.standardised_asset_list[
            [column for column in candidate_columns if column is not None]
        ]

        # Standardisation of the remaining columns has not been written yet
        raise NotImplementedError
|