From 39bc6c53b867c66601042c313649f912adaae8d7 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Thu, 1 Aug 2024 12:21:10 +0100
Subject: [PATCH] sfr investigation in progress

---
 etl/customers/goldman/property_ownership.py | 254 +++++++++++++++++++-
 1 file changed, 248 insertions(+), 6 deletions(-)

diff --git a/etl/customers/goldman/property_ownership.py b/etl/customers/goldman/property_ownership.py
index ebd72732..c1f37d4c 100644
--- a/etl/customers/goldman/property_ownership.py
+++ b/etl/customers/goldman/property_ownership.py
@@ -75,10 +75,15 @@ def find_f_g_properties(paths):
         epc_data = epc_data[~pd.isnull(epc_data["UPRN"])]
         epc_data["UPRN"] = epc_data["UPRN"].astype(int).astype(str)
 
-        # Get the newest EPC for each UPRN. We use LODGEMENT_DATE as a proxy for this
-        epc_data["LODGEMENT_DATETIME"] = pd.to_datetime(epc_data["LODGEMENT_DATETIME"], format='mixed', errors="coerce")
+        if pd.isnull(pd.to_datetime(epc_data["LODGEMENT_DATETIME"], errors="coerce")).sum():
+            raise Exception("wtf")
 
-        epc_data = epc_data.sort_values("LODGEMENT_DATETIME", ascending=False).drop_duplicates("UPRN")
+        # Get the newest EPC for each UPRN. We use LODGEMENT_DATE (then LODGEMENT_DATETIME) as a proxy for this
+        epc_data["LODGEMENT_DATETIME"] = pd.to_datetime(epc_data["LODGEMENT_DATETIME"], errors="coerce")
+
+        epc_data = epc_data.sort_values(
+            ["LODGEMENT_DATE", "LODGEMENT_DATETIME"], ascending=False
+        ).drop_duplicates("UPRN")
 
         # Get G & F properties
         epc_data = epc_data[epc_data["CURRENT_ENERGY_RATING"].isin(["G", "F"])]
@@ -401,6 +406,8 @@ def app():
             ~company_ownership["Property Address"].str.lower().str.startswith(starting_term)
         ]
 
+    # address = properties[properties["UPRN"] == 100030253055].squeeze()
+
     freehold_matching_lookup = []  # 634
     leasehold_matching_lookup = []  # 86
     shared_leasehold_match = []
@@ -493,12 +500,18 @@ def app():
     # freehold_matching_lookup = pd.read_excel("freehold_matching_lookup V2.xlsx")
     # leasehold_matching_lookup = pd.read_excel("leasehold_matching_lookup V2.xlsx")
 
+    # freehold_matching_lookup.shape
+    # (1537, 4)
+    # leasehold_matching_lookup.shape
+    # (390, 4)
+
     # The approximate matches aren't very good
     freehold_matching_lookup = freehold_matching_lookup[freehold_matching_lookup["match_type"] == "exact"]
     leasehold_matching_lookup = leasehold_matching_lookup[leasehold_matching_lookup["match_type"] == "exact"]
 
     # Combine
     combined_matching_lookup = pd.concat([freehold_matching_lookup, leasehold_matching_lookup])
+
     # Remove duplicates
     combined_matching_lookup = remove_duplicate_matches(
         matching_lookup=combined_matching_lookup, properties=properties, company_ownership=company_ownership
@@ -566,7 +579,6 @@ def app():
 
     land_registry_matches = []
     for _, match in tqdm(matched_addresses.iterrows(), total=len(matched_addresses)):
-
         # Filter land registry on the postcode
         lr_filtered = land_registry[
             (land_registry["postcode"] == match["epc_postcode"].lower().strip())
@@ -782,7 +794,7 @@ def app():
         right_on="uprn"
     ).drop(columns=["uprn"])
 
-    # Flat anything that sold in the last year
+    # Flag anything that sold in the last year
     matched_addresses["sold_recently"] = (
         matched_addresses["date_of_transfer"] >= pd.Timestamp.now() - pd.DateOffset(years=1)
     )
@@ -792,6 +804,9 @@ def app():
         (matched_addresses["TRANSACTION_TYPE"].isin(["marketed sale", "non marketed sale"]))
     )
 
+    # Save this
+    # matched_addresses.to_excel("combined_aggregate - pre filter 28th July.xlsx", index=False)
+
     # Drop rows on the booleans
     matched_addresses = matched_addresses[
         ~matched_addresses["sold_recently"] &
@@ -835,7 +850,7 @@ def app():
     # investment_50m_properties.to_excel("investment_50m_properties 28th July.xlsx", index=False)
 
     # Store the EPC data
-    # portfolio_epc_data_50m.to_excel("portfolio_epc_data_50m 29th July.xlsx", index=False)
+    # portfolio_epc_data_50m.to_excel("portfolio_epc_data_50m 28th July.xlsx", index=False)
 
     # We check if any of these properties are in a conservation area
     valuations = pd.read_excel("property value.xlsx")
@@ -997,3 +1012,230 @@ def prepare_anonymised_data():
     )
 
     df.to_excel("Property List - 50% redacted.xlsx", index=False)
+
+
+def adhoc_change_of_portfolio_analysis_july_2024():
+    """
+    This is just some adhoc analysis, which answers some questions which arose upon refreshing the SFR portfolio
+    in late July 2024
+    :return:
+    """
+
+    # Question 1: Which properties in the previous portfolio were in conservation areas or had listed/heritage status?
+    def answer_q1():
+        # Data was just stored here:
+        geospatial_data = pd.read_excel("geospatial_data.xlsx")
+
+        special_buildings = geospatial_data[
+            (geospatial_data["conservation_status"] == 1) |
+            geospatial_data["is_listed_building"] |
+            geospatial_data["is_heritage_building"]
+            ]
+
+        print(
+            f"There were {special_buildings.shape[0]} properties in the previous portfolio which were in conservation "
+            f"areas or had listed/heritage status"
+        )
+        print(f"{(special_buildings['conservation_status'] == 1).sum()} were in a conservation area")
+        print(f"{special_buildings['is_listed_building'].sum()} were listed buildings")
+        print(f"{special_buildings['is_heritage_building'].sum()} were heritage buildings")
+
+    answer_q1()
+
+    # Question 2: For each property in the old portfolio, why was it lost?
+    def answer_q2():
+        # We read in the previous 50m portfolio
+        previous_portfolio = pd.read_excel("investment_50m_properties 28th May.xlsx")  # 39 owners
+
+        new_matched_addresses = pd.read_excel("combined_aggregate - pre filter 28th July.xlsx")
+        new_portfolio = pd.read_excel("investment_50m_properties 28th July.xlsx")  # 69 owners
+
+        # dropped units
+        dropped_units = previous_portfolio[
+            ~previous_portfolio["UPRN"].isin(new_portfolio["UPRN"].values)
+        ]
+        # Lots of properties are missed out - why
+        # 1) What was dropped, but was in the matched addresses and therefore was maybe filtered out
+        dropped_units_matched = dropped_units[
+            dropped_units["UPRN"].isin(new_matched_addresses["UPRN"])
+        ].copy()
+
+        dropped_units_matched = dropped_units_matched.merge(
+            new_matched_addresses[
+                ["UPRN", 'transaction_id', 'price', 'date_of_transfer', 'sold_recently', 'sale_lodged_recently']
+            ],
+            how="left", on="UPRN"
+        )
+
+        # 97 units here - how many were sold?
+        of_which_sold = dropped_units_matched[
+            dropped_units_matched["sold_recently"]
+        ]
+        n_sold = of_which_sold.shape[0]
+        print(f"{n_sold} sold recently ({n_sold / previous_portfolio.shape[0] * 100})%")
+
+        of_which_have_sale_epc_but_not_sold = dropped_units_matched[
+            ~dropped_units_matched["sold_recently"] & dropped_units_matched["sale_lodged_recently"]
+            ]
+        n_with_sale_epc_but_not_yet_sold = of_which_have_sale_epc_but_not_sold.shape[0]
+        print(
+            f"{n_with_sale_epc_but_not_yet_sold} have a sale EPC but have not sold yet ("
+            f"{n_with_sale_epc_but_not_yet_sold / previous_portfolio.shape[0] * 100})%"
+        )
+
+        # What about things that haven't sold or don't look likely to sell
+        not_sold = dropped_units_matched[
+            ~dropped_units_matched["sold_recently"] & ~dropped_units_matched["sale_lodged_recently"]
+            ]
+
+        new_owner_sizes = new_portfolio.groupby(
+            ["Company Registration No. (1)"]
+        ).size().reset_index().rename(columns={0: "Number of Properties"})
+        new_owner_sizes = new_owner_sizes.sort_values("Number of Properties", ascending=False)
+
+        previous_owner_sizes = previous_portfolio.groupby(
+            ["Company Registration No. (1)"]
+        ).size().reset_index().rename(columns={0: "Number of Properties"})
+        previous_owner_sizes = previous_owner_sizes.sort_values("Number of Properties", ascending=False)
+
+        # Let's just confirm that we took in a bigger owner, as we see this unit was still matched
+        owner_too_small = []
+        owner_big_enough = []
+        for _, property in not_sold.iterrows():
+            owner_reg_id = property["Company Registration No. (1)"]
+            old_portfolio_owner_size = previous_owner_sizes[
+                previous_owner_sizes["Company Registration No. (1)"] == owner_reg_id
+                ]
+            # Big enough = held more properties than the smallest owner in the new portfolio; otherwise too small
+            if (
+                old_portfolio_owner_size["Number of Properties"].values[0] >
+                new_owner_sizes["Number of Properties"].min()
+            ):
+                owner_big_enough.append(property.to_dict())
+                continue
+
+            owner_too_small.append(property.to_dict())
+
+        n_owner_too_small = len(owner_too_small)
+        owner_big_enough = pd.DataFrame(owner_big_enough)
+
+        summary = []
+        for _, record in owner_big_enough.iterrows():
+            # Do we have this new owner?
+            new_owner = new_portfolio[
+                new_portfolio["Company Registration No. (1)"] == record["Company Registration No. (1)"]
+                ]
+            if new_owner.empty:
+                # Why don't we have this new owner
+                new_owner_data = new_matched_addresses[
+                    new_matched_addresses["Company Registration No. (1)"] == record["Company Registration No. (1)"]
+                    ]
+
+                new_owner_data_filtered = new_owner_data[
+                    ~new_owner_data["sold_recently"] & ~new_owner_data["sale_lodged_recently"]
+                    ]
+
+                summary.append(
+                    {
+                        "Owner Name": record["Proprietor Name (1)"],
+                        "Owner reg id": record["Company Registration No. (1)"],
+                        "N properties in new portfolio before filtering": new_owner_data.shape[0],
+                        "N properties in new portfolio after filtering": new_owner_data_filtered.shape[0],
+                    }
+
+                )
+                continue
+            raise Exception("something went wrong")
+
+        summary = pd.DataFrame(summary)
+
+        not_accounted_for = summary[
+            (
+                summary["N properties in new portfolio before filtering"] <
+                previous_owner_sizes["Number of Properties"].min()
+            )
+        ]
+
+        # We have two owners not accounted for:
+        # ALLMID LIMITED, 01959058
+        # CORAL RACING LIMITED, 541600
+        # What happened to these owners?
+        new_epc = pd.read_excel("EPC F & G Properties - V2.xlsx")
+        allmid = previous_portfolio[previous_portfolio["Company Registration No. (1)"] == "01959058"].copy()
+        # Check if any of the properties are not in the new EPC data
+        allmid["not_in_new_epc"] = ~allmid["UPRN"].isin(new_epc["UPRN"])
+        allmid["not_in_matched_pre_filtered"] = ~allmid["UPRN"].isin(new_matched_addresses["UPRN"])
+        # In the previous portfolio, Allmid had 4 properties and in the re-build, it has just 2. Why?
+        # Firstly, one of their properties was re-surveyed not at an F/G
+        # Secondly, one of their properties is no longer owned by them:
+        # https://www.zoopla.co.uk/property/uprn/100070553074/
+        # So as an owner, they fell out of the ranking
+        coral_racing = previous_portfolio[previous_portfolio["Company Registration No. (1)"] == "541600"].copy()
+        coral_racing["not_in_new_epc"] = ~coral_racing["UPRN"].isin(new_epc["UPRN"])
+        coral_racing["not_in_matched_pre_filtered"] = ~coral_racing["UPRN"].isin(new_matched_addresses["UPRN"])
+        # Coral goes down from 4 -> 1 on refresh, so what happened?
+        # 1) 2 properties had new EPCs and re-scored higher
+        # 2) 1 property, 85A Market Street, Church Gresley, Swadlincote, DE11 9PN is no longer matched to the ownership
+        #    data, which is correct
+
+        # Why were these units lost?
+        # There's just 1 owner, who is BARHAM PROPERTY LTD
+        owner_too_big_ids = owner_big_enough["Company Registration No. (1)"].unique()
+        owner_too_big_names = owner_big_enough["Proprietor Name (1)"].unique()
+        previous_owner_size = previous_owner_sizes[
+            previous_owner_sizes["Company Registration No. (1)"].isin(owner_too_big_ids)
+        ]
+        new_owner_size = new_matched_addresses[
+            new_matched_addresses["Company Registration No. (1)"].isin(owner_too_big_ids) |
+            new_matched_addresses["Proprietor Name (1)"].isin(owner_too_big_names)
+            ]
+
+        n_unsold = new_owner_size[~new_owner_size["sold_recently"] & ~new_owner_size["sale_lodged_recently"]].shape
+
+        # Happy with the justification to this point
+        assert (
+            (n_sold + n_with_sale_epc_but_not_yet_sold + n_owner_too_small + len(owner_big_enough)) ==
+            dropped_units_matched.shape[0]
+        )
+
+        # We now have a list of properties that were lost from the previous iteration to the next that were not matched
+        dropped_units_unmatched = dropped_units[
+            ~dropped_units["UPRN"].isin(new_matched_addresses["UPRN"])
+        ].copy()
+
+        # A few possibilities: They aren't in the EPC data?
+        new_epc = pd.read_excel("EPC F & G Properties - V2.xlsx")
+        unmatched_not_in_epc = dropped_units_unmatched[
+            ~dropped_units_unmatched["UPRN"].isin(new_epc["UPRN"])
+        ]
+        # There are 17 units that have had new EPCs above a G
+        # Who were the owners? - various, nothing particularly remarkable
+        (
+            previous_portfolio[
+                previous_portfolio["UPRN"].isin(unmatched_not_in_epc["UPRN"])
+            ]["Proprietor Name (1)"].value_counts()
+        )
+
+        # 22 final units to be accounted for...!
+        unmatched_in_epc = dropped_units_unmatched[
+            dropped_units_unmatched["UPRN"].isin(new_epc["UPRN"])
+        ]
+
+        # Some of them will be due to ownership
+        # TODO: Read in freehold/leasehold data and see how many of these were non-exact matches!
+        leasehold_matching_lookup = pd.read_excel("leasehold_matching_lookup V2.xlsx")
+        freehold_matching_lookup = pd.read_excel("freehold_matching_lookup V2.xlsx")
+        combined_matching_lookup = pd.concat([leasehold_matching_lookup, freehold_matching_lookup])
+        # This is 13 matches, all of them approximate
+        weak_matches = unmatched_in_epc.merge(combined_matching_lookup, how="inner", on="UPRN")
+
+        # These have been lost due to ownership updates. This has been checked manually for every unit and there has
+        # been sale activity for each one, justifying the change in ownership data
+        remaining_matches = unmatched_in_epc[
+            ~unmatched_in_epc["UPRN"].isin(weak_matches["UPRN"])
+        ]
+
+        assert dropped_units.shape[0] == (
+            (n_sold + n_with_sale_epc_but_not_yet_sold + n_owner_too_small + len(owner_big_enough)) + len(
+            weak_matches) + unmatched_not_in_epc.shape[0]
+        )