diff --git a/etl/ownership/Ownership.py b/etl/ownership/Ownership.py index 7403c45c..0bbb4689 100644 --- a/etl/ownership/Ownership.py +++ b/etl/ownership/Ownership.py @@ -81,6 +81,9 @@ class Ownership: # Data storage paths self.epc_data_filepath = f"ownership/{project_name}/{self.run_timestamp}/epc_data.xlsx" + self.filtered_land_registry_filepath = ( + f"ownership/{project_name}/{self.run_timestamp}/filtered_land_registry.xlsx" + ) # Data self.epc_data = None @@ -567,12 +570,19 @@ class Ownership: logger.info("Reading land registry data") self.land_registry = self.get_land_registry() + # Store this filtered version in S3 + save_excel_to_s3( + df=self.land_registry, + bucket_name="epc_data", + file_key=self.filtered_land_registry_filepath, + ) for col in ["postcode", "street", "paon", "saon"]: self.land_registry[col] = self.land_registry[col].str.lower().str.strip() self.land_registry["date_of_transfer"] = pd.to_datetime(self.land_registry["date_of_transfer"]) + logger.info("Performing land registry matching") land_registry_matches = [] for _, match in tqdm(self.matched_addresses.iterrows(), total=len(self.matched_addresses)): # Filter land registry on the postcode