From 520aa430b77462666ac2ca9405a5d7349172224f Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 20 Aug 2024 19:27:34 +0100 Subject: [PATCH] added step to remove owners with just 1 property --- etl/ownership/Ownership.py | 31 ++++++++++++++----- etl/ownership/config.py | 4 +++ .../projects/midlands_portfolio/app.py | 14 +++++---- 3 files changed, 36 insertions(+), 13 deletions(-) diff --git a/etl/ownership/Ownership.py b/etl/ownership/Ownership.py index 5f506881..5b421e7b 100644 --- a/etl/ownership/Ownership.py +++ b/etl/ownership/Ownership.py @@ -933,6 +933,7 @@ class Ownership: ) pivot_counts = pivot_counts.sort_values("total_number_of_properties", ascending=False) + pivot_counts = pivot_counts[pivot_counts["total_number_of_properties"] > 1] pivot_counts["approx_value"] = self.average_property_value * pivot_counts["total_number_of_properties"] pivot_counts["cumulative_value"] = pivot_counts["approx_value"].cumsum() @@ -1037,7 +1038,7 @@ class Ownership: return asset_list - def create_final_outputs(self, portfolio_timestamp): + def create_final_outputs(self, portfolio_timestamp, exclusion_uprns=None): """ Given the completed outputs of the matching process, this function creates the final outputs, after matching valuation data, and creates a "working" directory, which is our current view of the sfr portfolio. 
This means @@ -1047,8 +1048,10 @@ class Ownership: :return: """ + exclusion_uprns = [] if exclusion_uprns is None else exclusion_uprns + # Step 1: Read in the valuations data - valuations = read_excel_from_s3( + valuatio_ns = read_excel_from_s3( bucket_name=self.bucket, file_key=f"ownership/{self.project_name}/sfr property valuations.xlsx", header_row=0 @@ -1075,10 +1078,24 @@ class Ownership: header_row=0 ) - portfolio_epc_data["UPRN"].duplicated().sum() - portfolio_properties["UPRN"].duplicated().sum() - portfolio_properties[~portfolio_properties["UPRN"].astype(str).isin(portfolio_epc_data["UPRN"].astype(str))] + # Check they're the right size + if portfolio_owners["total_number_of_properties"].sum() != portfolio_properties["UPRN"].nunique(): + raise ValueError("Portfolio owners and properties don't match") - portfolio_properties[~portfolio_properties["UPRN"].astype(str).isin(portfolio_epc_data["UPRN"].astype(str))] + if portfolio_properties["UPRN"].nunique() != portfolio_epc_data["UPRN"].nunique(): + raise ValueError("Portfolio properties and epc data don't match") - portfolio_epc_data.shape + # We make some final cuts based on UPRNs that at a later stage are found to be odd + if portfolio_properties["UPRN"].isin(exclusion_uprns).sum(): + # Identify who the owners are for these uprns + owners = portfolio_properties[portfolio_properties["UPRN"].isin(exclusion_uprns)].groupby( + "Company Registration No. (1)" + )["UPRN"].nunique().reset_index().rename( + columns={"UPRN": "number_of_properties_to_exclude"} + ) + + min_owners_threshold = portfolio_owners["total_number_of_properties"].min() + + portfolio_owners = portfolio_owners.merge( + owners, how="left", on="Company Registration No. 
(1)", suffixes=("", "_excluded") + ) diff --git a/etl/ownership/config.py b/etl/ownership/config.py index 1b67e742..3f153817 100644 --- a/etl/ownership/config.py +++ b/etl/ownership/config.py @@ -28,4 +28,8 @@ EXCLUDED_UPRNS = [ 100031592801, # Can't find reliable information to this property on zoopla/rightmove 100031579087, + # Can't find reliable information to this property on zoopla/rightmove + 200000877273, + # Can't find reliable information to this property on zoopla/rightmove - seems like a post office! + 100071391639 ] diff --git a/etl/ownership/projects/midlands_portfolio/app.py b/etl/ownership/projects/midlands_portfolio/app.py index e79d86d2..8a2abe48 100644 --- a/etl/ownership/projects/midlands_portfolio/app.py +++ b/etl/ownership/projects/midlands_portfolio/app.py @@ -163,11 +163,13 @@ def app(): } print(body) - # We read in the current valuation data and identify if there are any uprns that need to be added - previous_valuations = pd.read_excel( - "/Users/khalimconn-kowlessar/Documents/hestia/Customers/sfr/sfr property valuations.xlsx") - missed = asset_list[~asset_list["uprn"].astype(str).isin(previous_valuations["uprn"].astype(str))] - missed.to_csv("missed_valuations.csv") + # # We read in the current valuation data and identify if there are any uprns that need to be added + # previous_valuations = pd.read_excel( + # "/Users/khalimconn-kowlessar/Documents/hestia/Customers/sfr/sfr property valuations.xlsx") + # missed = asset_list[~asset_list["uprn"].astype(str).isin(previous_valuations["uprn"].astype(str))] + # missed.to_csv("missed_valuations.csv") # We now need a distinct step to prepare final outputs - portfolio_timestamp = "2024-08-20 15:51:10.292075" + portfolio_timestamp = "2024-08-20 18:53:08.326351" + + exclusion_uprns = EXCLUDED_UPRNS