Model/etl/ownership/Ownership.py

from datetime import datetime
from typing import List
from tqdm import tqdm
import pandas as pd
import Levenshtein
import re
from utils.s3 import save_excel_to_s3, read_excel_from_s3
from utils.logger import setup_logger
from backend.SearchEpc import SearchEpc
from etl.spatial.OpenUprnClient import OpenUprnClient

logger = setup_logger()


class Ownership:
    # These are prefix phrases found in the ownership data. If an address begins with any of these terms, the
    # record does not refer to the property itself, so the record is removed before matching
    OWNERSHIP_STARTING_TERMS = [
        "land adjoining", "land on the", "land to the rear of", "land and buildings on the",
        "garage adjoining", "car park adjoining", "the land adjoining", "land and buildings adjoining",
        "all royal mines"
    ]

    # Anything that was sold within this many months is flagged as recently sold and is then
    # considered for dropping from matching
    SOLD_RECENTLY_MONTHS = 12

    # Anything that has been lodged for a marketed or unmarketed sale within this many months is
    # flagged as potentially in the process of being sold
    LODGED_RECENTLY_MONTHS = 12

    # These are the columns in the land registry data
    LAND_REGISTRY_COLUMNS = [
        "transaction_id",
        "price",
        "date_of_transfer",
        "postcode",
        "property_type",
        "old_new",
        "duration",
        "paon",
        "saon",
        "street",
        "locality",
        "town_city",
        "district",
        "county",
        "ppd_category_type",
        "record_status",
    ]

    def __init__(
        self,
        epc_paths: List[str],
        domestic_ownership_path: str,
        overseas_ownership_path: str,
        land_registry_path: str,
        project_name: str,
        bucket: str,
        average_property_value: float,
        portfolio_value: float,
        excluded_owners: List[str] = None,
        excluded_uprns: List[int] = None,
        save=True
    ):
        """

        :param epc_paths: A list of strings, which points to the location of the EPC data to be used. TO date, this
                          data has been held locally, and so will require extension to read from remote locaations like
                          s3
        :param domestic_ownership_path: A string which points to the location of the CCOD ownership data, that details
                                        corporate ownership of properties in the UK, where the companies are UK based
        :param overseas_ownership_path: A string which points to the location of the OCOD ownership data, that details
                                        corporate ownership of properties in the UK, where the companies are overseas
        :param land_registry_path: A string that points to the location of the land registry data
        :param project_name: A string that is used to identify the project
        :param bucket: The name of the s3 bucket where the data will be stored
        :param average_property_value: The average property value in the area
        """

        # All epc paths should end with certificates.csv
        if not any(path for path in epc_paths if path.endswith("certificates.csv")):
            raise ValueError("epc_paths contains a path that does not end with certificates.csv")
        self.epc_paths = epc_paths
        self.domestic_ownership_path = domestic_ownership_path
        self.overseas_ownership_path = overseas_ownership_path
        self.land_registry_path = land_registry_path

        self.excluded_owners = [] if excluded_owners is None else excluded_owners
        self.excluded_uprns = [] if excluded_uprns is None else excluded_uprns

        self.run_timestamp = str(datetime.now())
        self.project_name = project_name
        self.bucket = bucket

        self.average_property_value = average_property_value
        self.portfolio_value = portfolio_value

        # Data storage paths
        self.epc_data_filepath = f"ownership/{self.project_name}/{self.run_timestamp}/epc_data.xlsx"
        self.filtered_land_registry_filepath = (
            f"ownership/{self.project_name}/{self.run_timestamp}/filtered_land_registry.xlsx"
        )
        self.matched_addresses_pre_filter_filepath = (
            f"ownership/{self.project_name}/{self.run_timestamp}/matched_addresses_pre_filter.xlsx"
        )
        self.combined_matching_lookup_pre_filter_filepath = (
            f"ownership/{self.project_name}/{self.run_timestamp}/combined_matching_lookup_pre_filter.xlsx"
        )
        # Final output paths
        self.portfolio_owners_filepath = f"ownership/{self.project_name}/{self.run_timestamp}/portfolio_owners.xlsx"
        self.portfolio_properties_filepath = (
            f"ownership/{self.project_name}/{self.run_timestamp}/portfolio_properties.xlsx"
        )
        self.portfolio_epc_data_filepath = (
            f"ownership/{self.project_name}/{self.run_timestamp}/portfolio_epc_data.xlsx"
        )

        self.save = save

        # Data
        self.epc_data = None
        self.ownership_data = None
        self.freehold_matching_lookup = None
        self.leasehold_matching_lookup = None
        self.shared_freehold_match = None
        self.shared_leasehold_match = None
        self.land_registry = None

        # Match tables
        self.combined_matching_lookup = None
        self.matched_addresses = None
        self.land_registry_matches = None

        # Final outputs data
        self.portfolio_owners = None
        self.portfolio_properties = None
        self.portfolio_epc_data = None

    def pipeline(self, column_filters=None):
        """
        Runs the full ownership process
        :param column_filters: Dictionary with column names as keys and list of acceptable values as values. This
                                 dictionary is is used to filter the EPC data and should look like this:
                                {"column_name": ["value1", "value2", ...]}, where column_name is the name of the column
                                in the EPC data and ["value1", "value2", ...] is a list of acceptable values for that
                                column. If a column is not found in the EPC data, an exception is raised.
        """
        # Step 1: Get EPC data
        self.source_epc_properties(column_filters=column_filters)

        # Step 2: Get company ownership data
        self.load_company_ownership()

        # Step 3: Prepare data for matching
        self.prepare_for_matching()

        # Step 4: Match EPC data to ownership data
        self.match()

        # Step 5: Match land registry data to existing matches
        self.match_with_land_registry()
        # We store this data in s3 before we perform any filtering
        if self.save:
            save_excel_to_s3(
                df=self.matched_addresses,
                bucket_name=self.bucket,
                file_key=self.matched_addresses_pre_filter_filepath
            )
            save_excel_to_s3(
                df=self.combined_matching_lookup,
                bucket_name=self.bucket,
                file_key=self.combined_matching_lookup_pre_filter_filepath
            )

        # Prepare the final outputs:
        self.create_final_matches()

    def source_epc_properties(self, column_filters=None, postcodes=None):
        """
        Load the EPC certificates from every path in epc_paths, keep the newest certificate per UPRN within
        each file, apply the requested filters and store the combined result in self.epc_data.

        :param column_filters: Dictionary with column names as keys and list of acceptable values as values. This
                               dictionary is used to filter the EPC data and should look like this:
                               {"column_name": ["value1", "value2", ...]}, where column_name is the name of the
                               column in the EPC data and ["value1", "value2", ...] is a list of acceptable values
                               for that column. If a column is not found in the EPC data, an exception is raised.
        :param postcodes: A list of postcodes to filter the data on. The comparison is case-insensitive
        :raises Exception: If LODGEMENT_DATETIME contains values that cannot be parsed, or a column named in
                           column_filters is missing from the data
        :raises ValueError: If no EPC records remain once the filters have been applied
        """

        column_filters = {} if column_filters is None else column_filters

        # The EPC postcodes are lower-cased before the comparison, so the requested postcodes have to be
        # lower-cased as well; otherwise upper-case input would silently match nothing
        wanted_postcodes = None
        if postcodes is not None:
            if isinstance(postcodes, str):
                postcodes = [postcodes]
            wanted_postcodes = {
                postcode.lower() if isinstance(postcode, str) else postcode for postcode in postcodes
            }

        data = []
        for path in tqdm(self.epc_paths):
            epc_data = pd.read_csv(path, low_memory=False)
            # Records without a UPRN cannot be matched later on
            epc_data = epc_data[~pd.isnull(epc_data["UPRN"])]
            epc_data["UPRN"] = epc_data["UPRN"].astype(int).astype(str)

            # Parse once; anything that cannot be parsed (or is missing) becomes NaT and is treated as an error
            lodgement_datetime = pd.to_datetime(epc_data["LODGEMENT_DATETIME"], errors="coerce")
            if lodgement_datetime.isnull().any():
                raise Exception("Lodgement datetime contains invalid data")

            epc_data["LODGEMENT_DATETIME"] = lodgement_datetime
            # Newest certificate first, so drop_duplicates keeps the latest record per UPRN
            epc_data = epc_data.sort_values(["LODGEMENT_DATETIME"], ascending=False).drop_duplicates("UPRN")

            # Apply column filters
            for column, values in column_filters.items():
                if column in epc_data.columns:
                    epc_data = epc_data[epc_data[column].isin(values)]
                else:
                    raise Exception(f"Column {column} not found in data. column_filters is malformed")

            if wanted_postcodes is not None:
                epc_data = epc_data[epc_data["POSTCODE"].str.lower().isin(wanted_postcodes)]
            if epc_data.empty:
                continue

            data.append(epc_data)

        if not data:
            # pd.concat would raise a ValueError with an unhelpful message here; keep the type, explain the cause
            raise ValueError("No EPC records remain after applying column_filters and postcodes")

        self.epc_data = pd.concat(data, ignore_index=True)

        if self.excluded_uprns:
            self.epc_data = self.epc_data[~self.epc_data["UPRN"].astype(float).isin(self.excluded_uprns)]

        if self.save:
            # We now store the data in s3
            save_excel_to_s3(
                df=self.epc_data,
                bucket_name=self.bucket,
                file_key=self.epc_data_filepath
            )

    def load_company_ownership(self):
        """
        Load the domestic (CCOD) and overseas (OCOD) company ownership data into self.ownership_data.

        Each source is flagged through an is_overseas column before the two are stacked. The combined data is
        then restricted to postcodes present in self.epc_data (so the EPC data has to be loaded first), and
        rows whose company registration number is listed in excluded_owners are removed.
        """
        logger.info("Reading in company ownership data")
        domestic = pd.read_csv(self.domestic_ownership_path)
        domestic["is_overseas"] = False
        self.ownership_data = domestic

        overseas = pd.read_csv(self.overseas_ownership_path)
        overseas["is_overseas"] = True
        self.ownership_data = pd.concat([domestic, overseas])

        # Restricting to the EPC postcodes keeps the (large) ownership dataset manageable
        logger.info("Filtering ownership data on EPC postcodes")
        ownership_postcodes = self.ownership_data["Postcode"].str.lower()
        epc_postcodes = self.epc_data["POSTCODE"].str.lower().unique()
        self.ownership_data = self.ownership_data[ownership_postcodes.isin(epc_postcodes)]

        logger.info("Removing excluded owners")
        # Excluded owners are identified by their company registration number, compared as strings
        registration_numbers = self.ownership_data["Company Registration No. (1)"].astype(str)
        is_excluded = registration_numbers.isin(self.excluded_owners)
        self.ownership_data = self.ownership_data[~is_excluded]

    def prepare_for_matching(self):
        """
        Tidy both datasets ahead of matching.

        The EPC data is cut down to postcodes that occur in the ownership data and de-duplicated on UPRN
        (newest LODGEMENT_DATE wins). Ownership records whose address starts with one of
        OWNERSHIP_STARTING_TERMS are removed.
        """

        logger.info("Preparing data for matching")
        # The ownership data was already filtered on the EPC postcodes; this is the reverse filter, because
        # the ownership data might not contain every postcode that appears in the EPC data
        epc_postcodes = self.epc_data["POSTCODE"].str.lower()
        ownership_postcodes = self.ownership_data["Postcode"].str.lower().unique()
        self.epc_data = self.epc_data[epc_postcodes.isin(ownership_postcodes)]

        # Some UPRNs appear more than once - keep the most recently lodged record for each
        newest_first = self.epc_data.sort_values("LODGEMENT_DATE", ascending=False)
        self.epc_data = newest_first.drop_duplicates("UPRN")

        # Drop ownership entries such as "land adjoining ...", which do not reference the property itself
        for prefix in self.OWNERSHIP_STARTING_TERMS:
            lowered_addresses = self.ownership_data["Property Address"].str.lower()
            starts_with_prefix = lowered_addresses.str.startswith(prefix)
            self.ownership_data = self.ownership_data[~starts_with_prefix]

    @staticmethod
    def extract_numeric_part(house_number: str) -> str:
        """
        Extracts only the numeric part from a house number that may contain letters.

        Parameters:
        - house_number (str): The house number string possibly containing letters.

        Returns:
        - str: The numeric part of the house number.
        """
        # Use regular expression to replace all non-digit characters with nothing
        numeric_part = re.sub(r'\D', '', house_number)
        return numeric_part

    @staticmethod
    def remove_text_in_brackets(address: str) -> str:
        """
        Removes any text within parentheses, including the parentheses themselves.

        Parameters:
        - address (str): The address string to clean.

        Returns:
        - str: The cleaned address with text in parentheses removed.
        """
        # Regex to find and remove content in parentheses
        cleaned_address = re.sub(r'\s*\([^)]*\)', '', address)
        return cleaned_address

    @staticmethod
    def extract_range_from_house_number(house_number_range: str):
        """
        Detects if the house number includes a numeric range (formatted as 'x-y') and extracts all values within this
        range.
        Non-numeric strings containing hyphens are ignored.

        Parameters:
        - house_number_range (str): The house number string that might contain a range.

        Returns:
        - list of str: A list of all numbers within the range if it is a range; otherwise, returns None.
        """

        if not house_number_range:
            return None

        if '-' in house_number_range:
            parts = house_number_range.split('-')
            if len(parts) == 2 and parts[0].isdigit() and parts[1].isdigit():
                # Both parts are numeric, so it's a valid range
                start, end = map(int, parts)  # Convert parts to integers
                return [str(x) for x in range(start, end + 1)]
            else:
                # Not a valid numeric range
                return None
        else:
            # No hyphen present or not a range
            return None

    @staticmethod
    def is_in_range(row, house_no):
        """ Check if the house number is within the range provided in the row. """
        if row and any(house_no == num for num in row):
            return True
        return False

    @staticmethod
    def levenstein_match(matching_string, df, address_col):
        """
        Return the single row of df whose address is closest to matching_string by Levenshtein distance.

        Punctuation and spaces are stripped from the addresses in df[address_col] before the comparison;
        matching_string is compared as given. On a tie the earliest row wins. No distance threshold is
        applied, so the closest row is returned however far away it is.

        :param matching_string: The string to compare every address against
        :param df: DataFrame holding the candidate rows
        :param address_col: Name of the column in df that holds the candidate addresses
        :return: A one-row DataFrame (a slice of df) containing the closest candidate
        """
        # Normalise each candidate: drop punctuation first, then drop the spaces
        candidates = [
            re.sub(r'[^\w\s]', '', raw_address).replace(" ", "")
            for raw_address in df[address_col].tolist()
        ]

        distances = [Levenshtein.distance(matching_string, candidate) for candidate in candidates]
        closest = distances.index(min(distances))

        return df.iloc[closest:closest + 1]

    @classmethod
    def remove_duplicate_matches(cls, matching_lookup, properties, company_ownership):
        duplicated_titles = matching_lookup[matching_lookup["Title Number"].duplicated()]["Title Number"].unique()

        to_drop = []
        for dupe_title in duplicated_titles:
            dupe_data = matching_lookup[matching_lookup["Title Number"] == dupe_title].copy()
            matched_addresses = dupe_data.merge(
                properties[["UPRN", "ADDRESS"]].rename(columns={"ADDRESS": "epc_address"}),
                how="left", on="UPRN"
            ).merge(
                company_ownership[["Title Number", "Property Address"]],
                how="left", on="Title Number"
            )
            # We perform levenstein to get the best match
            best_match = cls.levenstein_match(
                matching_string=matched_addresses["Property Address"].values[0],
                df=matched_addresses,
                address_col="epc_address"
            )
            matches_to_drop = matched_addresses[
                ~matched_addresses["UPRN"].isin(best_match["UPRN"].values)
            ]

            to_drop.append(
                matches_to_drop[["UPRN", "Title Number"]].copy()
            )

        to_drop = pd.concat(to_drop) if to_drop else pd.DataFrame()

        if not to_drop.empty:
            merged = pd.merge(matching_lookup, to_drop, on=['UPRN', 'Title Number'], how='left', indicator=True)
            merged = merged[merged['_merge'] == 'left_only'].drop(columns=['_merge'])

            return merged

        return matching_lookup

    @classmethod
    def remove_duplicate_uprn_matches(cls, matching_lookup, properties, company_ownership):
        dupe_uprns = matching_lookup[matching_lookup["UPRN"].duplicated()]["UPRN"].unique().tolist()

        to_drop = []
        for dupe_uprn in dupe_uprns:
            dupe_data = matching_lookup[matching_lookup["UPRN"] == dupe_uprn].copy()
            matched_addresses = dupe_data.merge(
                properties[["UPRN", "ADDRESS"]].rename(columns={"ADDRESS": "epc_address"}),
                how="left", on="UPRN"
            ).merge(
                company_ownership[["Title Number", "Property Address"]],
                how="left", on="Title Number"
            )
            # We perform levenstein to get the best match
            best_match = cls.levenstein_match(
                matching_string=matched_addresses["Property Address"].values[0],
                df=matched_addresses,
                address_col="epc_address"
            )
            matches_to_drop = matched_addresses[
                ~matched_addresses["Title Number"].isin(best_match["Title Number"].values)
            ]

            to_drop.append(
                matches_to_drop[["UPRN", "Title Number"]].copy()
            )

        to_drop = pd.concat(to_drop)

        if not to_drop.empty:
            merged = pd.merge(matching_lookup, to_drop, on=['UPRN', 'Title Number'], how='left', indicator=True)
            merged = merged[merged['_merge'] == 'left_only'].drop(columns=['_merge'])

            return merged

        return matching_lookup

    @staticmethod
    def is_substring(x, match_string):
        if pd.isnull(x):
            return False
        return x in match_string.lower()

    @staticmethod
    def house_number_match(paon, house_number):
        # Firstly try and convert to numberic
        try:
            paon_numeric = int(paon)
            house_number_numeric = int(house_number)
            return paon_numeric == house_number_numeric
        except Exception as e:  # noqa
            # If we can't convert both to numeric, we do an equality

            return paon == house_number

    @staticmethod
    def check_equalities(lr_filtered):
        all_paon_equal = all(lr_filtered["paon"] == lr_filtered["paon"].values[0])
        if pd.isnull(lr_filtered["saon"].values[0]):
            all_saon_equal = all(pd.isnull(lr_filtered["saon"]))
        else:
            all_saon_equal = all(lr_filtered["saon"] == lr_filtered["saon"].values[0])

        all_street_equal = all(lr_filtered["street"] == lr_filtered["street"].values[0])

        return all_paon_equal, all_saon_equal, all_street_equal

    def match(self):
        """
        Match every EPC property to company-owned titles in the same postcode.

        For each EPC record the ownership rows sharing its postcode are narrowed down by house number or, when
        the EPC address has no house number, by the full EPC address appearing in the ownership address. A
        house number that is not found directly is retried against x-y ranges and then on its numeric part
        only; such matches are labelled "approximate" and are removed from the lookups again.

        Populates:
        - freehold_matching_lookup / leasehold_matching_lookup: exact matches to a single title of that tenure
        - shared_freehold_match / shared_leasehold_match: lists of DataFrames (UPRN, Title Number) for
          properties that matched more than one title of that tenure
        - combined_matching_lookup: both lookups stacked and de-duplicated on Title Number and on UPRN
        - matched_addresses: the combined lookup enriched with EPC and ownership columns plus house_number

        :raises ValueError: If epc_data or ownership_data has not been loaded yet
        """
        if (self.epc_data is None) or (self.ownership_data is None):
            raise ValueError("epc_data and ownership_data should not be null")

        logger.info("Matching EPC data to ownership data")
        lookup_columns = ["UPRN", "Title Number", "match_type"]
        freehold_matching_lookup = []
        leasehold_matching_lookup = []
        shared_leasehold_match = []
        shared_freehold_match = []

        # Lower-case the ownership postcodes once, rather than once per EPC record
        ownership_postcodes = self.ownership_data["Postcode"].str.lower()

        for _, address in tqdm(self.epc_data.iterrows(), total=len(self.epc_data)):
            match_type = "exact"
            filtered = self.ownership_data[ownership_postcodes == address["POSTCODE"].lower()].copy()

            # Extract the house number from the ownership address (bracketed text and commas removed)
            filtered["house_number"] = (
                filtered["Property Address"]
                .apply(self.remove_text_in_brackets)
                .apply(SearchEpc.get_house_number)
                .str.lower()
                .str.replace(",", "")
            )
            house_no = SearchEpc.get_house_number(address["ADDRESS1"])
            if house_no is not None:
                # The ownership house numbers are lower-cased above, so the EPC one is lower-cased too;
                # otherwise e.g. "12A" could never be an exact match for "12a"
                house_no = house_no.replace(",", "").lower()

            if house_no is None:
                # If the house number is missing, it means that we usually have a named property so we look for an
                # exact match on that name. The address is plain text, not a pattern, hence regex=False
                filtered = filtered[
                    filtered["Property Address"].str.lower().str.contains(address["ADDRESS"].lower(), regex=False)
                ]
                if filtered.shape[0] != 1:
                    continue

            else:

                if house_no not in filtered["house_number"].values:
                    # If this happens, we check house_number for a x-y range of addresses
                    filtered["house_number_range"] = filtered["house_number"].apply(
                        self.extract_range_from_house_number
                    )
                    # If we have found a house number range, we check if the house number is in the range and if not,
                    # we drop the row
                    filtered['is_in_range'] = filtered['house_number_range'].apply(
                        lambda x: self.is_in_range(x, house_no)
                    )

                    if filtered['is_in_range'].any():
                        # If house_no is found in any range, keep only rows where it is in range
                        filtered = filtered[filtered['is_in_range']]
                    else:
                        # If house_no is not found in any range, filter out rows where 'house_number_range' is not None
                        filtered = filtered[filtered['house_number_range'].isnull()]

                    # Strip out letters from house_no and house_number
                    house_no = self.extract_numeric_part(house_no)
                    filtered["house_number"] = filtered["house_number"].astype(str).apply(self.extract_numeric_part)
                    match_type = "approximate"

                filtered = filtered[filtered["house_number"] == house_no]

            if filtered.empty:
                continue

            filtered_freehold = filtered[filtered["Tenure"] == "Freehold"]
            filtered_leasehold = filtered[filtered["Tenure"] == "Leasehold"]

            if filtered_freehold.shape[0] > 1:
                # Several freehold titles for one property: record the freehold titles themselves
                matched = filtered_freehold[["Title Number"]].copy()
                matched.insert(0, "UPRN", address["UPRN"])
                shared_freehold_match.append(matched)
            elif not filtered_freehold.empty:
                freehold_matching_lookup.append(
                    {
                        "UPRN": address["UPRN"],
                        "Title Number": filtered_freehold["Title Number"].values[0],
                        "match_type": match_type,
                    }
                )

            if filtered_leasehold.shape[0] > 1:
                matched = filtered_leasehold[["Title Number"]].copy()
                matched.insert(0, "UPRN", address["UPRN"])
                shared_leasehold_match.append(matched)
            elif not filtered_leasehold.empty:
                leasehold_matching_lookup.append(
                    {
                        "UPRN": address["UPRN"],
                        "Title Number": filtered_leasehold["Title Number"].values[0],
                        "match_type": match_type,
                    }
                )

        logger.info("Matching complete - creating lookup tables")

        # Explicit columns keep the frames usable when a tenure produced no matches at all; without them the
        # match_type filter below fails with a KeyError
        self.freehold_matching_lookup = pd.DataFrame(freehold_matching_lookup, columns=lookup_columns)
        self.leasehold_matching_lookup = pd.DataFrame(leasehold_matching_lookup, columns=lookup_columns)

        # Only exact matches are carried forward
        self.freehold_matching_lookup = self.freehold_matching_lookup[
            self.freehold_matching_lookup["match_type"] == "exact"
            ]
        self.leasehold_matching_lookup = self.leasehold_matching_lookup[
            self.leasehold_matching_lookup["match_type"] == "exact"
            ]

        self.shared_leasehold_match = shared_leasehold_match
        self.shared_freehold_match = shared_freehold_match

        # finally, we create matched addresses
        self.combined_matching_lookup = pd.concat([self.freehold_matching_lookup, self.leasehold_matching_lookup])

        # Remove duplicates
        self.combined_matching_lookup = self.remove_duplicate_matches(
            matching_lookup=self.combined_matching_lookup,
            properties=self.epc_data,
            company_ownership=self.ownership_data
        )
        # We also have duplicates at a UPRN level
        self.combined_matching_lookup = self.remove_duplicate_uprn_matches(
            matching_lookup=self.combined_matching_lookup,
            properties=self.epc_data,
            company_ownership=self.ownership_data
        )

        self.matched_addresses = self.combined_matching_lookup.merge(
            self.epc_data[
                [
                    "UPRN",
                    "ADDRESS",
                    "ADDRESS1",
                    "CURRENT_ENERGY_EFFICIENCY",
                    "CURRENT_ENERGY_RATING",
                    "POSTCODE",
                    "LODGEMENT_DATE",
                    "TRANSACTION_TYPE",
                    "TENURE",
                ]
            ].rename(
                columns={
                    "ADDRESS": "epc_address",
                    "ADDRESS1": "epc_address1",
                    "POSTCODE": "epc_postcode"
                }
            ),
            how="left", on="UPRN"
        ).merge(
            self.ownership_data[
                [
                    "Title Number",
                    "Property Address",
                    "Postcode",
                    "Company Registration No. (1)",
                    "Proprietor Name (1)",
                    "Date Proprietor Added",
                ]
            ],
            how="left", on="Title Number"
        )

        # Let's try and get the house number
        self.matched_addresses["house_number"] = (
            self.matched_addresses["epc_address"]
            .apply(self.remove_text_in_brackets)
            .apply(SearchEpc.get_house_number)
            .str.lower()
            .str.replace(",", "")
        )

        logger.info("Successfully completed matching")

    def get_land_registry(self):
        """
        This function reads in the land registry data and filters it on the postcodes found in the EPC data
        """
        land_registry = pd.read_csv(self.land_registry_path, header=None)
        land_registry.columns = self.LAND_REGISTRY_COLUMNS
        land_registry = land_registry[
            land_registry["postcode"].str.lower().isin(self.epc_data["POSTCODE"].str.lower().unique())
        ]
        land_registry["date_of_transfer"] = pd.to_datetime(
            land_registry["date_of_transfer"], format="%Y-%m-%d", errors="coerce"
        )
        # Take data from the last 5 years
        land_registry = land_registry[
            (land_registry["date_of_transfer"] >= datetime.now() - pd.DateOffset(years=5))
        ]

        return land_registry

    def match_with_land_registry(self):
        """
        This function matches the land registry data to the existing matches
        :return:
        """
        # TODO: Refactor this entire function
        if self.matched_addresses is None:
            raise ValueError("Run match() first!")

        logger.info("Reading land registry data")
        self.land_registry = self.get_land_registry()
        # Store this fitereed version in s3
        save_excel_to_s3(
            df=self.land_registry,
            bucket_name=self.bucket,
            file_key=self.filtered_land_registry_filepath,
        )

        for col in ["postcode", "street", "paon", "saon"]:
            self.land_registry[col] = self.land_registry[col].str.lower().str.strip()

        self.land_registry["date_of_transfer"] = pd.to_datetime(self.land_registry["date_of_transfer"])

        logger.info("Performing land registry matching")
        land_registry_matches = []
        for _, match in tqdm(self.matched_addresses.iterrows(), total=len(self.matched_addresses)):
            # Filter land registry on the postcode
            lr_filtered = self.land_registry[
                (self.land_registry["postcode"] == match["epc_postcode"].lower().strip())
            ].copy()

            # Filter further, when the street is in in the address
            # street should be contained in epc_address
            lr_filtered = lr_filtered[
                lr_filtered["street"].apply(lambda x: self.is_substring(x, match["epc_address"].lower())) |
                lr_filtered["street"].apply(lambda x: self.is_substring(x, match["Property Address"].lower()))
                ]

            if lr_filtered.empty:
                continue

            # We now check if paon is in address 1
            lr_filtered["paon_match"] = lr_filtered["paon"].apply(
                lambda x: self.house_number_match(x, match["house_number"])
            )
            # We also try the secondary match
            lr_filtered["saon_match"] = (
                lr_filtered["saon"].apply(
                    lambda x: False if pd.isnull(x) else self.is_substring(x, match["epc_address1"])
                )
            )
            # We fileter where we have a primary or secondary match
            lr_filtered = lr_filtered[
                lr_filtered["paon_match"] | lr_filtered["saon_match"]
                ]

            if lr_filtered.empty:
                continue
            elif lr_filtered.shape[0] == 1:
                land_registry_matches.append(
                    {
                        "uprn": match["UPRN"],
                        "transaction_id": lr_filtered['transaction_id'].values[0],
                        "price": lr_filtered["price"].values[0],
                        "date_of_transfer": lr_filtered["date_of_transfer"].values[0],
                    }
                )
                continue
            elif lr_filtered.shape[0] > 1:
                # We make sure all records are the same and take the newest
                all_paon_equal, all_saon_equal, all_street_equal = self.check_equalities(lr_filtered)
                has_paon_match = any(lr_filtered["paon_match"])

                if all_paon_equal and all_street_equal and all_saon_equal:
                    # Take the newest record, append and continue
                    lr_filtered = lr_filtered.sort_values("date_of_transfer", ascending=False)
                    lr_filtered = lr_filtered.head(1)
                    land_registry_matches.append(
                        {
                            "uprn": match["UPRN"],
                            "transaction_id": lr_filtered['transaction_id'].values[0],
                            "price": lr_filtered["price"].values[0],
                            "date_of_transfer": lr_filtered["date_of_transfer"].values[0],
                        }
                    )
                    continue
                elif has_paon_match and all_street_equal:
                    # Peform filter on paon
                    lr_filtered = lr_filtered[lr_filtered["paon_match"]]
                    # Do an addtiioanl equality check
                    all_paon_equal, all_saon_equal, all_street_equal = self.check_equalities(lr_filtered)
                    if all_paon_equal and all_street_equal and all_saon_equal:
                        lr_filtered = lr_filtered.sort_values("date_of_transfer", ascending=False)
                        lr_filtered = lr_filtered.head(1)
                        land_registry_matches.append(
                            {
                                "uprn": match["UPRN"],
                                "transaction_id": lr_filtered['transaction_id'].values[0],
                                "price": lr_filtered["price"].values[0],
                                "date_of_transfer": lr_filtered["date_of_transfer"].values[0],
                            }
                        )
                    else:
                        # We do a match on saon
                        lr_filtered["saon_match2"] = lr_filtered["saon"].apply(
                            lambda x: False if pd.isnull(x) else self.is_substring(x, match["epc_address"])
                        )

                        lr_filtered = lr_filtered[lr_filtered["saon_match2"]]

                        if lr_filtered.empty:
                            continue
                        elif lr_filtered.shape[0] == 1:
                            land_registry_matches.append(
                                {
                                    "uprn": match["UPRN"],
                                    "transaction_id": lr_filtered['transaction_id'].values[0],
                                    "price": lr_filtered["price"].values[0],
                                    "date_of_transfer": lr_filtered["date_of_transfer"].values[0],
                                }
                            )
                            continue
                        else:
                            raise NotImplementedError("wtf")
                else:
                    # We have a final check, based on an observed case
                    lr_address_1 = " ".join([x.lower().strip() for x in match["Property Address"].split(",")[0:2]])

                    lr_filtered["paon_match2"] = lr_filtered["paon"].apply(
                        lambda x: False if pd.isnull(x) else self.is_substring(x, lr_address_1)
                    )

                    lr_filtered = lr_filtered[lr_filtered["paon_match2"]]

                    if lr_filtered.empty:
                        continue
                    elif lr_filtered.shape[0] == 1:
                        land_registry_matches.append(
                            {
                                "uprn": match["UPRN"],
                                "transaction_id": lr_filtered['transaction_id'].values[0],
                                "price": lr_filtered["price"].values[0],
                                "date_of_transfer": lr_filtered["date_of_transfer"].values[0],
                            }
                        )
                        continue
                    else:
                        # Check all the same
                        all_paon_equal, all_saon_equal, all_street_equal = self.check_equalities(lr_filtered)

                        # Check saon is house number with exact match
                        lr_filtered["saon_match2"] = lr_filtered["saon"].apply(
                            lambda x: False if pd.isnull(x) else self.house_number_match(x, match["house_number"])
                        )
                        # We check if we have a flat
                        match_flat_number = re.match("flat (\d+)", match["epc_address1"].lower())
                        match_apartment_number = re.match("apartment (\d+)", match["epc_address1"].lower())
                        lr_filtered["saon_match3"] = False
                        if match_flat_number is not None:
                            # Get out the match
                            match_flat_number = "flat " + match_flat_number.group(1)
                            lr_filtered["saon_match3"] = lr_filtered["saon"].apply(
                                lambda x: False if pd.isnull(x) else x == match_flat_number
                            )

                        if match_apartment_number is not None:
                            # Get out the match
                            match_apartment_number = "apartment " + match_apartment_number.group(1)
                            lr_filtered["saon_match3"] = lr_filtered["saon"].apply(
                                lambda x: False if pd.isnull(x) else x == match_apartment_number
                            )

                        if all_paon_equal and all_saon_equal and all_street_equal:
                            # Take the newest record
                            lr_filtered = lr_filtered.sort_values("date_of_transfer", ascending=False)
                            lr_filtered = lr_filtered.head(1)
                            land_registry_matches.append(
                                {
                                    "uprn": match["UPRN"],
                                    "transaction_id": lr_filtered['transaction_id'].values[0],
                                    "price": lr_filtered["price"].values[0],
                                    "date_of_transfer": lr_filtered["date_of_transfer"].values[0],
                                }
                            )
                            continue
                        elif any(lr_filtered["saon_match2"]):
                            lr_filtered = lr_filtered[lr_filtered["saon_match2"]]
                            all_saon_equal, all_paon_equal, all_street_equal = self.check_equalities(lr_filtered)
                            if all_paon_equal and all_saon_equal and all_street_equal:
                                # Filter on the newest record
                                lr_filtered = lr_filtered.sort_values("date_of_transfer", ascending=False)
                                lr_filtered = lr_filtered.head(1)
                            if lr_filtered.shape[0] == 1:
                                land_registry_matches.append(
                                    {
                                        "uprn": match["UPRN"],
                                        "transaction_id": lr_filtered['transaction_id'].values[0],
                                        "price": lr_filtered["price"].values[0],
                                        "date_of_transfer": lr_filtered["date_of_transfer"].values[0],
                                    }
                                )
                                continue
                        elif any(lr_filtered["saon_match3"]):
                            lr_filtered = lr_filtered[lr_filtered["saon_match3"]]
                            if lr_filtered.shape[0] == 1:
                                land_registry_matches.append(
                                    {
                                        "uprn": match["UPRN"],
                                        "transaction_id": lr_filtered['transaction_id'].values[0],
                                        "price": lr_filtered["price"].values[0],
                                        "date_of_transfer": lr_filtered["date_of_transfer"].values[0],
                                    }
                                )
                                continue

                        raise NotImplementedError("wtf")
            else:
                raise NotImplementedError("What happened here?")

        self.land_registry_matches = pd.DataFrame(land_registry_matches)

        logger.info("Sucessfully completed land registry matching - merging onto matched_addresses")
        # Merge onto the EPC - ownership matches
        self.matched_addresses = self.matched_addresses.merge(
            self.land_registry_matches,
            how="left",
            left_on="UPRN",
            right_on="uprn"
        ).drop(columns=["uprn"])

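        # Flag anything that sold within the last SOLD_RECENTLY_MONTHS months.
        # NOTE(review): the offset below is built with `month=` (singular), which sets the month component instead of
        # shifting by a number of months; `months=` (as used for sale_lodged_recently) looks intended - confirm.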
        self.matched_addresses["sold_recently"] = (
            self.matched_addresses["date_of_transfer"] >= pd.Timestamp.now() -
            pd.DateOffset(month=self.SOLD_RECENTLY_MONTHS)
        )

        self.matched_addresses["sale_lodged_recently"] = (
            (
                pd.to_datetime(
                    self.matched_addresses["LODGEMENT_DATE"]
                ) >= pd.Timestamp.now() - pd.DateOffset(months=self.LODGED_RECENTLY_MONTHS)
            ) &
            (self.matched_addresses["TRANSACTION_TYPE"].isin(["marketed sale", "non marketed sale"]))
        )

    def aggregate_matches(self, matching_lookup, company_ownership, properties):
        df = matching_lookup.merge(
            company_ownership, how="left", on="Title Number"
        ).merge(
            properties[["UPRN", "LOCAL_AUTHORITY_LABEL"]], how="left", on="UPRN"
        )
        counts = (
            df.groupby(["Company Registration No. (1)", "LOCAL_AUTHORITY_LABEL"])["UPRN"]
            .count()
            .reset_index(name="number_of_properties")
        )
        counts = counts.sort_values("number_of_properties", ascending=False)

        pivot_counts = counts.pivot_table(
            index=["Company Registration No. (1)"],  # Rows: companies and proprietors
            columns="LOCAL_AUTHORITY_LABEL",  # Columns: each local authority
            values="number_of_properties",  # The counts of properties
            fill_value=0  # Fill missing values with 0 (where there are no properties owned)
        ).reset_index()

        total_counts = (
            df.groupby(["Company Registration No. (1)"])["UPRN"]
            .count()
            .reset_index(name="total_number_of_properties")
        )

        # We have cases where the same company registration number results in the same company name, so we produce a
        # best
        # name per company registration number
        best_names = (
            df.groupby(["Company Registration No. (1)"])["Proprietor Name (1)"]
            .first()
            .reset_index()
        )

        total_counts = best_names.merge(
            total_counts, how="left", on=["Company Registration No. (1)"]
        )

        pivot_counts = pivot_counts.merge(
            total_counts, how="left", on=["Company Registration No. (1)"]
        )

        pivot_counts = pivot_counts.sort_values("total_number_of_properties", ascending=False)
        pivot_counts = pivot_counts[pivot_counts["total_number_of_properties"] > 1]

        pivot_counts["approx_value"] = self.average_property_value * pivot_counts["total_number_of_properties"]
        pivot_counts["cumulative_value"] = pivot_counts["approx_value"].cumsum()

        return pivot_counts

    def create_final_matches(self):
        """
        Build the final portfolio tables from the matching results gathered so far

        Starting from self.matched_addresses, rows flagged sold_recently or sale_lodged_recently are dropped. Of the
        remainder, only rows whose conservation_status, is_listed_building and is_heritage_building are each in
        [None, False] are kept. The survivors are rolled up per company with aggregate_matches, and owners are kept
        while their cumulative_value is at most self.portfolio_value.

        Sets self.portfolio_owners, self.portfolio_properties and self.portfolio_epc_data, and writes all three to S3
        when self.save is truthy.

        :return: None
        :raises ValueError: if the owners' property totals, the portfolio property UPRNs and the EPC UPRNs do not
            reconcile with each other
        """
        logger.info("Creating final matches")
        sold = self.matched_addresses["sold_recently"]
        lodged = self.matched_addresses["sale_lodged_recently"]
        eligible = self.matched_addresses[~sold & ~lodged].copy()

        logger.info("Performing conservation area and listed/herigage building filtering")

        # The spatial lookup reads from a fixed bucket rather than self.bucket
        eligible_uprns = eligible["UPRN"].unique().tolist()
        spatial = OpenUprnClient.get_spatial_data(eligible_uprns, bucket_name="retrofit-data-dev")
        spatial = spatial[["UPRN", "conservation_status", "is_listed_building", "is_heritage_building"]].copy()
        # UPRNs are cast to strings before joining onto the matched addresses
        spatial["UPRN"] = spatial["UPRN"].astype(str)

        eligible = eligible.merge(spatial, how="left", on="UPRN")

        # NOTE(review): UPRNs without a spatial record come out of the left merge as NaN; whether
        # isin([None, False]) keeps those rows depends on how pandas compares None with NaN - confirm
        acceptable = [None, False]
        outside_conservation = eligible["conservation_status"].isin(acceptable)
        not_listed = eligible["is_listed_building"].isin(acceptable)
        not_heritage = eligible["is_heritage_building"].isin(acceptable)
        eligible = eligible[outside_conservation & not_listed & not_heritage]

        # Restrict the title/UPRN lookup to the properties that survived the filters
        lookup = self.combined_matching_lookup
        eligible_lookup = lookup[lookup["UPRN"].isin(eligible["UPRN"])]

        # Roll up to one row per owner
        owner_rollup = self.aggregate_matches(
            matching_lookup=eligible_lookup,
            company_ownership=self.ownership_data,
            properties=self.epc_data,
        )

        # Take owners from the top of the ranking while the running value stays within the target
        within_target = owner_rollup["cumulative_value"] <= self.portfolio_value
        self.portfolio_owners = owner_rollup[within_target]

        owner_ids = self.portfolio_owners["Company Registration No. (1)"]
        self.portfolio_properties = eligible[eligible["Company Registration No. (1)"].isin(owner_ids)]

        # Sanity check: the owner totals must add up to the number of distinct properties selected
        owned_total = self.portfolio_owners["total_number_of_properties"].sum()
        property_uprn_count = self.portfolio_properties["UPRN"].nunique()
        if owned_total != property_uprn_count:
            raise ValueError("Portfolio owners and properties don't match")

        in_portfolio = self.epc_data["UPRN"].isin(self.portfolio_properties["UPRN"])
        self.portfolio_epc_data = self.epc_data[in_portfolio]

        # Sanity check: every selected property must be present in the EPC extract
        if property_uprn_count != self.portfolio_epc_data["UPRN"].nunique():
            raise ValueError("Portfolio properties and epc data don't match")

        if not self.save:
            return

        logger.info("Storing final outpus")
        outputs = (
            (self.portfolio_owners, self.portfolio_owners_filepath),
            (self.portfolio_properties, self.portfolio_properties_filepath),
            (self.portfolio_epc_data, self.portfolio_epc_data_filepath),
        )
        for frame, file_key in outputs:
            save_excel_to_s3(
                df=frame,
                bucket_name=self.bucket,
                file_key=file_key,
            )

    def get_asset_list(self):
        """
        From the EPC data, creates the asset list
        :return:
        """

        asset_list = self.portfolio_epc_data[["UPRN", "ADDRESS1", "POSTCODE"]].copy().rename(
            columns={
                "UPRN": "uprn",
                "ADDRESS1": "address",
                "POSTCODE": "postcode"
            }
        )

        return asset_list

    def create_final_outputs(self, portfolio_timestamp, storage_date, exclusion_uprns=None):
        """
        Promote one stored matching run to the "current" working view of the sfr portfolio, with valuations attached

        Reads the valuations workbook and the owners / EPC / properties workbooks stored under the given
        portfolio_timestamp, re-checks that the three tables reconcile, left-merges the valuations onto the
        properties by UPRN, and writes the three tables under "current/<storage_date>". Keeping a separate "current"
        directory means the portfolio can be iterated on without affecting the final outputs, and a new version is
        only committed there once we are happy with it. This information shouldn't update very often, so storing it
        at a daily level is fine

        :param portfolio_timestamp: folder name of the stored run to load
        :param storage_date: folder name under "current" to write the outputs to
        :param exclusion_uprns: optional UPRNs to cut from the portfolio; the cut itself is not implemented yet, so
            an exception is raised if any of them is present in the portfolio properties
        :return:
        :raises ValueError: if the loaded owners, properties and EPC tables do not reconcile
        :raises Exception: if any of exclusion_uprns appears in the portfolio properties
        """

        def _load(file_key):
            # Every workbook is read from the project bucket with its header on the first row
            return read_excel_from_s3(bucket_name=self.bucket, file_key=file_key, header_row=0)

        def _store(frame, file_key):
            save_excel_to_s3(df=frame, bucket_name=self.bucket, file_key=file_key)

        if exclusion_uprns is None:
            exclusion_uprns = []

        # Step 1: Read in the valuations data
        valuations = _load(f"ownership/{self.project_name}/sfr property valuations.xlsx")

        # Load in the portfolio data: owners, then EPC, then properties
        portfolio_owners = _load(f"ownership/{self.project_name}/{portfolio_timestamp}/portfolio_owners.xlsx")
        portfolio_epc_data = _load(f"ownership/{self.project_name}/{portfolio_timestamp}/portfolio_epc_data.xlsx")
        portfolio_properties = _load(
            f"ownership/{self.project_name}/{portfolio_timestamp}/portfolio_properties.xlsx"
        )

        # Check they're the right size
        property_uprn_count = portfolio_properties["UPRN"].nunique()
        if portfolio_owners["total_number_of_properties"].sum() != property_uprn_count:
            raise ValueError("Portfolio owners and properties don't match")

        if property_uprn_count != portfolio_epc_data["UPRN"].nunique():
            raise ValueError("Portfolio properties and epc data don't match")

        # We make some final cuts based on UPRNs that at a later stage are found to be odd
        excluded_present = portfolio_properties["UPRN"].isin(exclusion_uprns)
        if excluded_present.sum():
            # Not implemented yet. Sketch of the intended handling:
            # Identify who the owners are for these uprns
            # owners = portfolio_properties[portfolio_properties["UPRN"].isin(exclusion_uprns)].groupby(
            #     "Company Registration No. (1)"
            # )["UPRN"].nunique().reset_index().rename(
            #     columns={"UPRN": "number_of_properties_to_exclude"}
            # )
            #
            # min_owners_threshold = portfolio_owners["total_number_of_properties"].min()
            #
            # portfolio_owners = portfolio_owners.merge(
            #     owners, how="left", on="Company Registration No. (1)", suffixes=("", "_excluded")
            # )
            raise Exception("Implement me!")

        # Step 2: Merge in the valuations data. The valuations' own address and postcode columns are dropped
        # before the join
        valuation_columns = valuations.rename(columns={"uprn": "UPRN"}).drop(columns=['address', 'postcode'])
        portfolio_properties = portfolio_properties.merge(valuation_columns, how="left", on="UPRN")

        # Step 3: Store the final outputs
        _store(portfolio_owners, f"ownership/{self.project_name}/current/{storage_date}/portfolio_owners.xlsx")
        _store(
            portfolio_properties,
            f"ownership/{self.project_name}/current/{storage_date}/portfolio_properties.xlsx",
        )
        _store(portfolio_epc_data, f"ownership/{self.project_name}/current/{storage_date}/portfolio_epc_data.xlsx")