# Model/asset_list/AssetList.py

import os
import usaddress
import pandas as pd
from utils.logger import setup_logger
from backend.SearchEpc import SearchEpc

# Module-level logger built by the project's setup_logger helper. It is not referenced in the part of the file
# visible here.
logger = setup_logger()


class AssetList:
    """
    This class is used to standardise asset lists so that we can process the core information in a consistent manner.
    """

    # These are the accepted methods we have for producing the address1 column from the full address.
    # _extract_address1 rejects any method name that is not listed here.
    ADDRESS_1_CLEANING_METHODS = [
        "first_two_words",  # Keep the first two words of the full address, where the separator is a space
        "first_word",  # Keep the first word of the full address, where the separator is a space
        "house_number_extraction",  # Call SearchEpc.get_house_number with the full address and postcode of each row
        # "address1_extraction"  # Not enabled yet: intended to use the NLP model to extract address1
    ]

    # Property type labels. Not referenced elsewhere in the visible part of this class.
    STANDARD_PROPERTY_TYPES = [
        "house",
        "flat",
        "bungalow",
        "maisonette",
        "park home",
        "block house",
    ]

    # Standard column names. standardise() uses STANDARD_ADDRESS_1 and STANDARD_FULL_ADDRESS as the names of the
    # columns it creates when the landlord data does not supply them; the remaining names are not referenced in
    # the visible part of this class.
    STANDARD_ADDRESS_1 = "domna_address_1"
    STANDARD_POSTCODE = "domna_postcode"
    STANDARD_FULL_ADDRESS = "domna_full_address"
    STANDARD_YEAR_BUILT = "domna_year_built"
    STANDARD_UPRN = "ordnance_survey_uprn"
    STANDARD_PROPERTY_TYPE = "landlord_property_type"
    STANDARD_WALL_CONSTRUCTION = "landlord_wall_construction"
    STANDARD_HEATING_SYSTEM = "landlord_heating_system"
    STANDARD_EXISTING_PV = "landlord_existing_pv"

    # Name of the column that create_property_id() writes the generated property ID into
    DOMNA_PROPERTY_ID = "domna_property_id"

    def __init__(
        self,
        local_filepath,
        sheet_name,
        address1_colname,
        postcode_colname,
        full_address_colname,
        landlord_property_id=None,
        full_address_cols_to_concat=None,
        missing_postcodes_method=None,
        address1_extraction_method=None,
        landlord_year_built=None,
        landlord_uprn=None,
        landlord_property_type=None,
        landlord_wall_construction=None,
        landlord_heating_system=None,
        landlord_existing_pv=None,
        header=0
    ):
        """
        Load one sheet of an Excel asset list and record how its columns map onto the standard ones.

        :param local_filepath: path of the Excel workbook, passed to pandas.read_excel
        :param sheet_name: sheet to read from the workbook
        :param address1_colname: name of the address1 column, or None if it has to be extracted
        :param postcode_colname: name of the postcode column
        :param full_address_colname: name of the full address column, or None if it has to be concatenated
        :param landlord_property_id: name of the landlord's own property ID column
        :param full_address_cols_to_concat: columns joined into a full address when none is supplied
        :param missing_postcodes_method: stored for later use; not read in the visible part of the class
        :param address1_extraction_method: one of ADDRESS_1_CLEANING_METHODS, used when address1 is missing
        :param landlord_year_built: name of the landlord's year built column
        :param landlord_uprn: name of the landlord's UPRN column
        :param landlord_property_type: name of the landlord's property type column
        :param landlord_wall_construction: name of the landlord's wall construction column
        :param landlord_heating_system: name of the landlord's heating system column
        :param landlord_existing_pv: name of the landlord's existing PV column
        :param header: header row index, passed to pandas.read_excel
        """
        # Where the data comes from
        self.local_filepath = local_filepath
        self.sheet_name = sheet_name

        # Load the sheet once; all cleaning happens on a separate copy so the raw data stays untouched
        raw = pd.read_excel(local_filepath, header=header, sheet_name=sheet_name)
        self.raw_asset_list = raw
        self.standardised_asset_list = raw.copy()

        # The non-intrusive survey columns are detected through the presence of this one column
        self.non_intrusives_present = "CIGA Check Required" in raw.columns

        # Column names: identifier and address columns first, then the landlord attribute columns
        self.landlord_property_id = landlord_property_id
        self.address1_colname = address1_colname
        self.postcode_colname = postcode_colname
        self.full_address_colname = full_address_colname
        self.landlord_year_built = landlord_year_built
        self.landlord_uprn = landlord_uprn
        self.landlord_property_type = landlord_property_type
        self.landlord_wall_construction = landlord_wall_construction
        self.landlord_heating_system = landlord_heating_system
        self.landlord_existing_pv = landlord_existing_pv

        # Options that steer the cleaning steps
        self.full_address_cols_to_concat = full_address_cols_to_concat
        self.missing_postcodes_method = missing_postcodes_method
        self.address1_extraction_method = address1_extraction_method

        # One debug slot per landlord attribute, all empty to begin with
        self.debug_information = dict.fromkeys(
            ("property_type", "wall_construction", "heating_system", "existing_pv")
        )

    def _extract_address1(self, asset_list, full_address_col, postcode_col, method="first_two_words"):
        """
        Derive the address1 column from the full address and write it to asset_list[self.address1_colname].

        :param asset_list: data frame to add the address1 column to (modified in place and returned)
        :param full_address_col: name of the column holding the full address
        :param postcode_col: name of the postcode column (only read by "house_number_extraction")
        :param method: one of ADDRESS_1_CLEANING_METHODS
        :return: asset_list with the address1 column populated
        :raises ValueError: if method is not an accepted cleaning method
        """
        if method not in self.ADDRESS_1_CLEANING_METHODS:
            raise ValueError(f"Method {method} for producing address1 not recognized")

        if method == "house_number_extraction":
            def _house_number(row):
                return SearchEpc.get_house_number(address=row[full_address_col], postcode=row[postcode_col])

            extracted = asset_list.apply(_house_number, axis=1)
        elif method == "first_word":
            words = asset_list[full_address_col].str.split(" ")
            extracted = words.str[0]
        elif method == "first_two_words":
            words = asset_list[full_address_col].str.split(" ")
            extracted = words.str[:2].str.join(" ")
        else:
            # An accepted method that has no implementation in this function
            raise ValueError(f"Method {method} not recognized")

        asset_list[self.address1_colname] = extracted
        return asset_list

    @staticmethod
    def _address1_extraction(x):
        pass

    def create_property_id(self):
        """
        This function creates the domna property ID: a non-negative 64-bit integer derived from the full address
        concatenated with the postcode of each row.

        The ID comes from a SHA-256 digest rather than the builtin hash(). Python salts str hashes per process
        (see PYTHONHASHSEED), so hash() would give the same property a different ID on every run.
        :return: None - the ID is written to the DOMNA_PROPERTY_ID column of the standardised asset list
        """
        import hashlib

        def _stable_id(value):
            # The first 8 bytes of the digest form an unsigned integer in [0, 2 ** 64), so IDs are never negative
            digest = hashlib.sha256(str(value).encode("utf-8")).digest()
            return int.from_bytes(digest[:8], "big")

        address_and_postcode = (
            self.standardised_asset_list[self.full_address_colname]
            + self.standardised_asset_list[self.postcode_colname]
        )
        self.standardised_asset_list[self.DOMNA_PROPERTY_ID] = address_and_postcode.apply(_stable_id)

    @staticmethod
    def _strip_postcode_from_full_address(full_address, postcode):
        cleaned = full_address.replace(postcode, "")
        # Remove any trailing commas and spaces
        cleaned = cleaned.rstrip(", ").strip(",").strip()
        return cleaned

    def standardise(self):
        """
        This function is used to standardise the asset list.

        Steps, applied to self.standardised_asset_list:
          1. drop rows without a postcode
          2. clean non-breaking spaces and runs of spaces in the postcode / full address / address1 columns
          3. build the full address from full_address_cols_to_concat when no full address column was supplied
          4. extract address1 from the full address when no address1 column was supplied
          5. strip the postcode out of a supplied full address
          6. create the domna property ID
          7. keep only the identifier and address columns plus the configured landlord columns

        The standardisation is not finished yet: once the steps above have run, this function raises
        NotImplementedError. The work done so far remains available on self.standardised_asset_list.
        :return: nothing yet, see above
        :raises ValueError: if address1 is missing and no extraction method was given, or if the full address is
            missing and no columns to concatenate were given
        :raises NotImplementedError: always, after the steps above have completed
        """

        # Remove rows without a postcode
        if self.postcode_colname is not None:
            self.standardised_asset_list = self.standardised_asset_list.dropna(subset=[self.postcode_colname])

        # We clean up potential non-breaking spaces, and runs of spaces of any length
        supplied_text_cols = [
            c for c in (self.postcode_colname, self.full_address_colname, self.address1_colname) if c is not None
        ]
        for col in supplied_text_cols:
            cleaned = self.standardised_asset_list[col].astype(str)
            cleaned = cleaned.str.replace("\xa0", " ", regex=False)
            # A single literal replace of two spaces would leave runs of three or more only partly collapsed
            cleaned = cleaned.str.replace(r" {2,}", " ", regex=True)
            self.standardised_asset_list[col] = cleaned

        # The full address has to exist before address1 can be extracted from it, so build it first
        full_address_was_supplied = self.full_address_colname is not None
        if not full_address_was_supplied:
            if not self.full_address_cols_to_concat:
                raise ValueError("Missing full address - please specify columns to concatenate")
            self.full_address_colname = self.STANDARD_FULL_ADDRESS
            address_parts = self.standardised_asset_list[self.full_address_cols_to_concat]
            # Missing parts are skipped and the rest converted to str, so NaN or numeric cells cannot break the join
            self.standardised_asset_list[self.full_address_colname] = [
                ", ".join(str(part) for part in row if pd.notna(part))
                for row in address_parts.itertuples(index=False, name=None)
            ]

        if self.address1_colname is None:
            if self.address1_extraction_method is None:
                raise ValueError("Missing address 1 - please specify an extraction method")
            self.address1_colname = self.STANDARD_ADDRESS_1
            # If we do not have this, we produce it
            self.standardised_asset_list = self._extract_address1(
                asset_list=self.standardised_asset_list,
                full_address_col=self.full_address_colname,
                postcode_col=self.postcode_colname,
                method=self.address1_extraction_method
            )

        if full_address_was_supplied:
            # Make sure to strip the postcode out of the full address. This happens after the address1 extraction
            # so the extraction still sees the address exactly as it was supplied.
            self.standardised_asset_list[self.full_address_colname] = [
                self._strip_postcode_from_full_address(full_address=full_address, postcode=postcode)
                for full_address, postcode in zip(
                    self.standardised_asset_list[self.full_address_colname],
                    self.standardised_asset_list[self.postcode_colname],
                )
            ]

        # We create the domna property id
        self.create_property_id()

        # We keep just the columns we care about and will work through the various columns and standardise.
        # Optional columns that were not configured are None and are left out; dict.fromkeys removes repeated
        # names while preserving the order.
        wanted_columns = (
            self.landlord_property_id,
            self.DOMNA_PROPERTY_ID,
            self.address1_colname,
            self.postcode_colname,
            self.full_address_colname,
            self.landlord_year_built,
            self.landlord_uprn,
            self.landlord_property_type,
        )
        columns_to_keep = list(dict.fromkeys(c for c in wanted_columns if c is not None))
        self.standardised_asset_list = self.standardised_asset_list[columns_to_keep]

        raise NotImplementedError