debugging ownership class

This commit is contained in:
Khalim Conn-Kowlessar 2024-08-20 17:43:00 +01:00
parent 9938dea190
commit 41c38e622d
2 changed files with 60 additions and 3 deletions

View file

@ -4,7 +4,7 @@ from tqdm import tqdm
import pandas as pd
import Levenshtein
import re
from utils.s3 import save_excel_to_s3
from utils.s3 import save_excel_to_s3, read_excel_from_s3
from utils.logger import setup_logger
from backend.SearchEpc import SearchEpc
from etl.spatial.OpenUprnClient import OpenUprnClient
@ -948,7 +948,7 @@ class Ownership:
matched_addresses_final = self.matched_addresses[
~self.matched_addresses["sold_recently"] &
~self.matched_addresses["sale_lodged_recently"]
]
].copy()
logger.info("Performing conservation area and listed/herigage building filtering")
@ -973,7 +973,7 @@ class Ownership:
# Filter combined_matching_lookup accordingly
combined_matching_lookup_final = self.combined_matching_lookup[
self.combined_matching_lookup["UPRN"].isin(self.combined_matching_lookup["UPRN"])
self.combined_matching_lookup["UPRN"].isin(matched_addresses_final["UPRN"])
]
# Roll up portfolio
@ -991,8 +991,16 @@ class Ownership:
)
]
# We perform some checks
if self.portfolio_owners["total_number_of_properties"].sum() != self.portfolio_properties["UPRN"].nunique():
raise ValueError("Portfolio owners and properties don't match")
self.portfolio_epc_data = self.epc_data[self.epc_data["UPRN"].isin(self.portfolio_properties["UPRN"])]
# Additional checks
if self.portfolio_properties["UPRN"].nunique() != self.portfolio_epc_data["UPRN"].nunique():
raise ValueError("Portfolio properties and epc data don't match")
logger.info("Storing final outpus")
# Store data
save_excel_to_s3(
@ -1028,3 +1036,49 @@ class Ownership:
)
return asset_list
def create_final_outputs(self, portfolio_timestamp):
    """
    Load the stored outputs of one matching run and log basic UPRN consistency diagnostics.

    Given the completed outputs of the matching process, this function creates the final outputs, after matching
    valuation data, and creates a "working" directory, which is our current view of the sfr portfolio. This means
    that we can iterate on the portfolio without affecting the final outputs, and then once we're happy with the
    new version, we can commit those files to the "working" directory. This information shouldn't update very
    often and so we're ok to store this at a daily level.

    NOTE(review): only the loading step and the diagnostics below are implemented so far; the valuation matching
    and the write to the "working" directory described above are not part of this method yet.

    :param portfolio_timestamp: name of the timestamped folder under ownership/<project_name>/ that holds the
        portfolio_owners.xlsx, portfolio_epc_data.xlsx and portfolio_properties.xlsx files to load
    :return: None
    """
    # Step 1: Read in the valuations data
    # NOTE(review): loaded but not consumed yet - kept so the read (and any failure it raises) still happens
    valuations = read_excel_from_s3(
        bucket_name=self.bucket,
        file_key=f"ownership/{self.project_name}/sfr property valuations.xlsx",
        header_row=0
    )

    # Load in the portfolio data
    # 1) owners
    # NOTE(review): loaded but not consumed yet
    portfolio_owners = read_excel_from_s3(
        bucket_name=self.bucket,
        file_key=f"ownership/{self.project_name}/{portfolio_timestamp}/portfolio_owners.xlsx",
        header_row=0
    )
    # 2) EPC
    portfolio_epc_data = read_excel_from_s3(
        bucket_name=self.bucket,
        file_key=f"ownership/{self.project_name}/{portfolio_timestamp}/portfolio_epc_data.xlsx",
        header_row=0
    )
    # 3) properties
    portfolio_properties = read_excel_from_s3(
        bucket_name=self.bucket,
        file_key=f"ownership/{self.project_name}/{portfolio_timestamp}/portfolio_properties.xlsx",
        header_row=0
    )

    # Diagnostics: these values used to be computed and thrown away, so they were only visible when the lines
    # were run interactively. Each one is now computed once and written to the log.
    epc_duplicate_count = portfolio_epc_data["UPRN"].duplicated().sum()
    property_duplicate_count = portfolio_properties["UPRN"].duplicated().sum()
    logger.info(
        f"Duplicated UPRNs - portfolio_epc_data: {epc_duplicate_count}, "
        f"portfolio_properties: {property_duplicate_count}"
    )

    # Both sides are cast to str before the membership test, as in the original comparison
    epc_uprns = portfolio_epc_data["UPRN"].astype(str)
    properties_without_epc = portfolio_properties[~portfolio_properties["UPRN"].astype(str).isin(epc_uprns)]
    logger.info(f"Portfolio properties with no matching EPC UPRN: {len(properties_without_epc)}")

    logger.info(f"portfolio_epc_data shape: {portfolio_epc_data.shape}")

View file

@ -162,3 +162,6 @@ def app():
"budget": None,
}
print(body)
# We now need a distinct step to prepare final outputs
portfolio_timestamp = "2024-08-20 15:51:10.292075"