added step to remove owners with just 1 property

This commit is contained in:
Khalim Conn-Kowlessar 2024-08-20 19:27:34 +01:00
parent a153de51c3
commit 520aa430b7
3 changed files with 36 additions and 13 deletions

View file

@ -933,6 +933,7 @@ class Ownership:
)
pivot_counts = pivot_counts.sort_values("total_number_of_properties", ascending=False)
pivot_counts = pivot_counts[pivot_counts["total_number_of_properties"] > 1]
pivot_counts["approx_value"] = self.average_property_value * pivot_counts["total_number_of_properties"]
pivot_counts["cumulative_value"] = pivot_counts["approx_value"].cumsum()
@ -1037,7 +1038,7 @@ class Ownership:
return asset_list
def create_final_outputs(self, portfolio_timestamp):
def create_final_outputs(self, portfolio_timestamp, exclusion_uprns=None):
"""
Given the completed outputs of the matching process, this function creates the final outputs, after matching
valuation data, and creates a "working" directory, which is our current view of the sfr portfolio. This means
@ -1047,8 +1048,10 @@ class Ownership:
:return:
"""
exclusion_uprns = [] if exclusion_uprns is None else exclusion_uprns
# Step 1: Read in the valuations data
valuations = read_excel_from_s3(
valuatio_ns = read_excel_from_s3(
bucket_name=self.bucket,
file_key=f"ownership/{self.project_name}/sfr property valuations.xlsx",
header_row=0
@ -1075,10 +1078,24 @@ class Ownership:
header_row=0
)
portfolio_epc_data["UPRN"].duplicated().sum()
portfolio_properties["UPRN"].duplicated().sum()
portfolio_properties[~portfolio_properties["UPRN"].astype(str).isin(portfolio_epc_data["UPRN"].astype(str))]
# Check they're the right size
if portfolio_owners["total_number_of_properties"].sum() != portfolio_properties["UPRN"].nunique():
raise ValueError("Portfolio owners and properties don't match")
portfolio_properties[~portfolio_properties["UPRN"].astype(str).isin(portfolio_epc_data["UPRN"].astype(str))]
if portfolio_properties["UPRN"].nunique() != portfolio_epc_data["UPRN"].nunique():
raise ValueError("Portfolio properties and epc data don't match")
portfolio_epc_data.shape
# We make some final cuts based on UPRNs that at a later stage are found to be odd
if portfolio_properties["UPRN"].isin(exclusion_uprns).sum():
# Identify who the owners are for these UPRNs
owners = portfolio_properties[portfolio_properties["UPRN"].isin(exclusion_uprns)].groupby(
"Company Registration No. (1)"
)["UPRN"].nunique().reset_index().rename(
columns={"UPRN": "number_of_properties_to_exclude"}
)
min_owners_threshold = portfolio_owners["total_number_of_properties"].min()
portfolio_owners = portfolio_owners.merge(
owners, how="left", on="Company Registration No. (1)", suffixes=("", "_excluded")
)

View file

@ -28,4 +28,8 @@ EXCLUDED_UPRNS = [
100031592801,
# Can't find reliable information for this property on zoopla/rightmove
100031579087,
# Can't find reliable information for this property on zoopla/rightmove
200000877273,
# Can't find reliable information for this property on zoopla/rightmove - seems like a post office!
100071391639
]

View file

@ -163,11 +163,13 @@ def app():
}
print(body)
# We read in the current valuation data and identify if there are any uprns that need to be added
previous_valuations = pd.read_excel(
"/Users/khalimconn-kowlessar/Documents/hestia/Customers/sfr/sfr property valuations.xlsx")
missed = asset_list[~asset_list["uprn"].astype(str).isin(previous_valuations["uprn"].astype(str))]
missed.to_csv("missed_valuations.csv")
# # We read in the current valuation data and identify if there are any uprns that need to be added
# previous_valuations = pd.read_excel(
# "/Users/khalimconn-kowlessar/Documents/hestia/Customers/sfr/sfr property valuations.xlsx")
# missed = asset_list[~asset_list["uprn"].astype(str).isin(previous_valuations["uprn"].astype(str))]
# missed.to_csv("missed_valuations.csv")
# We now need a distinct step to prepare final outputs
portfolio_timestamp = "2024-08-20 15:51:10.292075"
portfolio_timestamp = "2024-08-20 18:53:08.326351"
exclusion_uprns = EXCLUDED_UPRNS