From 520aa430b77462666ac2ca9405a5d7349172224f Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Tue, 20 Aug 2024 19:27:34 +0100
Subject: [PATCH] added step to remove owners with just 1 property

---
 etl/ownership/Ownership.py                    | 31 ++++++++++++++-----
 etl/ownership/config.py                       |  4 +++
 .../projects/midlands_portfolio/app.py        | 14 +++++----
 3 files changed, 36 insertions(+), 13 deletions(-)

diff --git a/etl/ownership/Ownership.py b/etl/ownership/Ownership.py
index 5f506881..5b421e7b 100644
--- a/etl/ownership/Ownership.py
+++ b/etl/ownership/Ownership.py
@@ -933,6 +933,7 @@ class Ownership:
         )
 
         pivot_counts = pivot_counts.sort_values("total_number_of_properties", ascending=False)
+        pivot_counts = pivot_counts[pivot_counts["total_number_of_properties"] > 1]
 
         pivot_counts["approx_value"] = self.average_property_value * pivot_counts["total_number_of_properties"]
         pivot_counts["cumulative_value"] = pivot_counts["approx_value"].cumsum()
@@ -1037,7 +1038,7 @@ class Ownership:
 
         return asset_list
 
-    def create_final_outputs(self, portfolio_timestamp):
+    def create_final_outputs(self, portfolio_timestamp, exclusion_uprns=None):
         """
         Given the completed outputs of the matching process, this function creates the final outputs, after matching
         valuation data, and creates a "working" directory, which is our current view of the sfr portfolio. This means
@@ -1047,8 +1048,10 @@ class Ownership:
         :return:
         """
 
+        exclusion_uprns = [] if exclusion_uprns is None else exclusion_uprns
+
         # Step 1: Read in the valuations data
-        valuations = read_excel_from_s3(
+        valuations = read_excel_from_s3(
             bucket_name=self.bucket,
             file_key=f"ownership/{self.project_name}/sfr property valuations.xlsx",
             header_row=0
@@ -1075,10 +1078,24 @@ class Ownership:
             header_row=0
         )
 
-        portfolio_epc_data["UPRN"].duplicated().sum()
-        portfolio_properties["UPRN"].duplicated().sum()
-        portfolio_properties[~portfolio_properties["UPRN"].astype(str).isin(portfolio_epc_data["UPRN"].astype(str))]
+        # Check they're the right size
+        if portfolio_owners["total_number_of_properties"].sum() != portfolio_properties["UPRN"].nunique():
+            raise ValueError("Portfolio owners and properties don't match")
 
-        portfolio_properties[~portfolio_properties["UPRN"].astype(str).isin(portfolio_epc_data["UPRN"].astype(str))]
+        if portfolio_properties["UPRN"].nunique() != portfolio_epc_data["UPRN"].nunique():
+            raise ValueError("Portfolio properties and epc data don't match")
 
-        portfolio_epc_data.shape
+        # We make some final cuts based on UPRNs that at a later stage are found to be odd
+        if portfolio_properties["UPRN"].isin(exclusion_uprns).sum():
+            # Identify who the owners are for these UPRNs
+            owners = portfolio_properties[portfolio_properties["UPRN"].isin(exclusion_uprns)].groupby(
+                "Company Registration No. (1)"
+            )["UPRN"].nunique().reset_index().rename(
+                columns={"UPRN": "number_of_properties_to_exclude"}
+            )
+
+            min_owners_threshold = portfolio_owners["total_number_of_properties"].min()
+
+            portfolio_owners = portfolio_owners.merge(
+                owners, how="left", on="Company Registration No. (1)", suffixes=("", "_excluded")
+            )
diff --git a/etl/ownership/config.py b/etl/ownership/config.py
index 1b67e742..3f153817 100644
--- a/etl/ownership/config.py
+++ b/etl/ownership/config.py
@@ -28,4 +28,8 @@ EXCLUDED_UPRNS = [
     100031592801,
     # Can't find reliable information to this property on zoopla/rightmove
     100031579087,
+    # Can't find reliable information for this property on zoopla/rightmove
+    200000877273,
+    # Can't find reliable information for this property on zoopla/rightmove - seems like a post office!
+    100071391639
 ]
diff --git a/etl/ownership/projects/midlands_portfolio/app.py b/etl/ownership/projects/midlands_portfolio/app.py
index e79d86d2..8a2abe48 100644
--- a/etl/ownership/projects/midlands_portfolio/app.py
+++ b/etl/ownership/projects/midlands_portfolio/app.py
@@ -163,11 +163,13 @@ def app():
     }
     print(body)
 
-    # We read in the current valuation data and identify if there are any uprns that need to be added
-    previous_valuations = pd.read_excel(
-        "/Users/khalimconn-kowlessar/Documents/hestia/Customers/sfr/sfr property valuations.xlsx")
-    missed = asset_list[~asset_list["uprn"].astype(str).isin(previous_valuations["uprn"].astype(str))]
-    missed.to_csv("missed_valuations.csv")
+    # # We read in the current valuation data and identify if there are any uprns that need to be added
+    # previous_valuations = pd.read_excel(
+    #     "/Users/khalimconn-kowlessar/Documents/hestia/Customers/sfr/sfr property valuations.xlsx")
+    # missed = asset_list[~asset_list["uprn"].astype(str).isin(previous_valuations["uprn"].astype(str))]
+    # missed.to_csv("missed_valuations.csv")
 
     # We now need a distinct step to prepare final outputs
-    portfolio_timestamp = "2024-08-20 15:51:10.292075"
+    portfolio_timestamp = "2024-08-20 18:53:08.326351"
+
+    exclusion_uprns = EXCLUDED_UPRNS