added step to remove owners with just 1 property

2026-08-02 21:08:24 +00:00 · 2024-08-20 19:27:34 +01:00 · 2024-08-20 19:27:34 +01:00 · 520aa430b7
commit 520aa430b7
parent a153de51c3
3 changed files with 36 additions and 13 deletions
--- a/etl/ownership/Ownership.py
+++ b/etl/ownership/Ownership.py
@ -933,6 +933,7 @@ class Ownership:
        )

        pivot_counts = pivot_counts.sort_values("total_number_of_properties", ascending=False)
+        pivot_counts = pivot_counts[pivot_counts["total_number_of_properties"] > 1]

        pivot_counts["approx_value"] = self.average_property_value * pivot_counts["total_number_of_properties"]
        pivot_counts["cumulative_value"] = pivot_counts["approx_value"].cumsum()
@ -1037,7 +1038,7 @@ class Ownership:

        return asset_list

-    def create_final_outputs(self, portfolio_timestamp):
+    def create_final_outputs(self, portfolio_timestamp, exclusion_uprns=None):
        """
        Given the completed outputs of the matching process, this function creates the final outputs, after matching
        valuation data, and creates a "working" directory, which is our current view of the sfr portfolio. This means
@ -1047,8 +1048,10 @@ class Ownership:
        :return:
        """

+        exclusion_uprns = [] if exclusion_uprns is None else exclusion_uprns
+
        # Step 1: Read in the valuations data
-        valuations = read_excel_from_s3(
+        valuatio_ns = read_excel_from_s3(
            bucket_name=self.bucket,
            file_key=f"ownership/{self.project_name}/sfr property valuations.xlsx",
            header_row=0
@ -1075,10 +1078,24 @@ class Ownership:
            header_row=0
        )

-        portfolio_epc_data["UPRN"].duplicated().sum()
-        portfolio_properties["UPRN"].duplicated().sum()
-        portfolio_properties[~portfolio_properties["UPRN"].astype(str).isin(portfolio_epc_data["UPRN"].astype(str))]
+        # Check they're the right size
+        if portfolio_owners["total_number_of_properties"].sum() != portfolio_properties["UPRN"].nunique():
+            raise ValueError("Portfolio owners and properties don't match")

-        portfolio_properties[~portfolio_properties["UPRN"].astype(str).isin(portfolio_epc_data["UPRN"].astype(str))]
+        if portfolio_properties["UPRN"].nunique() != portfolio_epc_data["UPRN"].nunique():
+            raise ValueError("Portfolio properties and epc data don't match")

-        portfolio_epc_data.shape
+        # We make some final cuts based on UPRNs that at a later stage are found to be odd
+        if portfolio_properties["UPRN"].isin(exclusion_uprns).sum():
+            # Identify who the owners are for these UPRNs
+            owners = portfolio_properties[portfolio_properties["UPRN"].isin(exclusion_uprns)].groupby(
+                "Company Registration No. (1)"
+            )["UPRN"].nunique().reset_index().rename(
+                columns={"UPRN": "number_of_properties_to_exclude"}
+            )
+
+            min_owners_threshold = portfolio_owners["total_number_of_properties"].min()
+
+            portfolio_owners = portfolio_owners.merge(
+                owners, how="left", on="Company Registration No. (1)", suffixes=("", "_excluded")
+            )
--- a/etl/ownership/config.py
+++ b/etl/ownership/config.py
@ -28,4 +28,8 @@ EXCLUDED_UPRNS = [
    100031592801,
    # Can't find reliable information to this property on zoopla/rightmove
    100031579087,
+    # Can't find reliable information about this property on zoopla/rightmove
+    200000877273,
+    # Can't find reliable information about this property on zoopla/rightmove - seems like a post office!
+    100071391639
 ]
--- a/etl/ownership/projects/midlands_portfolio/app.py
+++ b/etl/ownership/projects/midlands_portfolio/app.py
@ -163,11 +163,13 @@ def app():
    }
    print(body)

-    # We read in the current valuation data and identify if there are any uprns that need to be added
-    previous_valuations = pd.read_excel(
-        "/Users/khalimconn-kowlessar/Documents/hestia/Customers/sfr/sfr property valuations.xlsx")
-    missed = asset_list[~asset_list["uprn"].astype(str).isin(previous_valuations["uprn"].astype(str))]
-    missed.to_csv("missed_valuations.csv")
+    # # We read in the current valuation data and identify if there are any uprns that need to be added
+    # previous_valuations = pd.read_excel(
+    #     "/Users/khalimconn-kowlessar/Documents/hestia/Customers/sfr/sfr property valuations.xlsx")
+    # missed = asset_list[~asset_list["uprn"].astype(str).isin(previous_valuations["uprn"].astype(str))]
+    # missed.to_csv("missed_valuations.csv")

    # We now need a distinct step to prepare final outputs
-    portfolio_timestamp = "2024-08-20 15:51:10.292075"
+    portfolio_timestamp = "2024-08-20 18:53:08.326351"
+
+    exclusion_uprns = EXCLUDED_UPRNS