diff --git a/etl/ownership/Ownership.py b/etl/ownership/Ownership.py index a3aa9e15..5f506881 100644 --- a/etl/ownership/Ownership.py +++ b/etl/ownership/Ownership.py @@ -4,7 +4,7 @@ from tqdm import tqdm import pandas as pd import Levenshtein import re -from utils.s3 import save_excel_to_s3 +from utils.s3 import save_excel_to_s3, read_excel_from_s3 from utils.logger import setup_logger from backend.SearchEpc import SearchEpc from etl.spatial.OpenUprnClient import OpenUprnClient @@ -948,7 +948,7 @@ class Ownership: matched_addresses_final = self.matched_addresses[ ~self.matched_addresses["sold_recently"] & ~self.matched_addresses["sale_lodged_recently"] - ] + ].copy() logger.info("Performing conservation area and listed/herigage building filtering") @@ -973,7 +973,7 @@ class Ownership: # Filter combined_matching_lookup accordingly combined_matching_lookup_final = self.combined_matching_lookup[ - self.combined_matching_lookup["UPRN"].isin(self.combined_matching_lookup["UPRN"]) + self.combined_matching_lookup["UPRN"].isin(matched_addresses_final["UPRN"]) ] # Roll up portfolio @@ -991,8 +991,16 @@ class Ownership: ) ] + # We perform some checks + if self.portfolio_owners["total_number_of_properties"].sum() != self.portfolio_properties["UPRN"].nunique(): + raise ValueError("Portfolio owners and properties don't match") + self.portfolio_epc_data = self.epc_data[self.epc_data["UPRN"].isin(self.portfolio_properties["UPRN"])] + # Additional checks + if self.portfolio_properties["UPRN"].nunique() != self.portfolio_epc_data["UPRN"].nunique(): + raise ValueError("Portfolio properties and epc data don't match") + logger.info("Storing final outpus") # Store data save_excel_to_s3( @@ -1028,3 +1036,49 @@ class Ownership: ) return asset_list + + def create_final_outputs(self, portfolio_timestamp): + """ + Given the completed outputs of the matching process, this function creates the final outputs, after matching + valuation data, and creates a "working" directory, which is our 
current view of the sfr portfolio. This means + that we can iterate on the portfolio without affecting the final outputs, and then once we're happy with the + new version, we can commit those files to the "working" directory. This information shouldn't update very + often and so we're ok to store this at a daily level + :return: None. NOTE(review): the UPRN duplicate/mismatch expressions at the end of this method discard their results + """ + + # Step 1: Read in the valuations data + valuations = read_excel_from_s3( + bucket_name=self.bucket, + file_key=f"ownership/{self.project_name}/sfr property valuations.xlsx", + header_row=0 + ) + + # Load in the portfolio data + # 1) owners + portfolio_owners = read_excel_from_s3( + bucket_name=self.bucket, + file_key=f"ownership/{self.project_name}/{portfolio_timestamp}/portfolio_owners.xlsx", + header_row=0 + ) + # 2) EPC + portfolio_epc_data = read_excel_from_s3( + bucket_name=self.bucket, + file_key=f"ownership/{self.project_name}/{portfolio_timestamp}/portfolio_epc_data.xlsx", + header_row=0 + ) + + # 3) properties + portfolio_properties = read_excel_from_s3( + bucket_name=self.bucket, + file_key=f"ownership/{self.project_name}/{portfolio_timestamp}/portfolio_properties.xlsx", + header_row=0 + ) + + portfolio_epc_data["UPRN"].duplicated().sum() + portfolio_properties["UPRN"].duplicated().sum() + portfolio_properties[~portfolio_properties["UPRN"].astype(str).isin(portfolio_epc_data["UPRN"].astype(str))] + + portfolio_properties[~portfolio_properties["UPRN"].astype(str).isin(portfolio_epc_data["UPRN"].astype(str))] + + portfolio_epc_data.shape diff --git a/etl/ownership/projects/midlands_portfolio/app.py b/etl/ownership/projects/midlands_portfolio/app.py index 99b8fc48..ae7822a6 100644 --- a/etl/ownership/projects/midlands_portfolio/app.py +++ b/etl/ownership/projects/midlands_portfolio/app.py @@ -162,3 +162,6 @@ def app(): "budget": None, } print(body) + + # We now need a distinct step to prepare final outputs + portfolio_timestamp = "2024-08-20 15:51:10.292075"