From 71310526ef2f9c310e336652e8528578ea17aef4 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 5 Feb 2026 14:07:43 +0000 Subject: [PATCH] ready for review (not deployed --- .idea/Model.iml | 2 +- .idea/misc.xml | 2 +- backend/onboarders/base.py | 31 +++++++++++++++--------- backend/onboarders/handler.py | 5 ++-- backend/onboarders/parity.py | 37 ++++++++++++++++++++--------- backend/onboarders/requirements.txt | 6 +++++ 6 files changed, 57 insertions(+), 26 deletions(-) diff --git a/.idea/Model.iml b/.idea/Model.iml index c6561970..a079cfaf 100644 --- a/.idea/Model.iml +++ b/.idea/Model.iml @@ -7,7 +7,7 @@ - + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml index 50cad4ca..b6084632 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -3,7 +3,7 @@ - + diff --git a/backend/onboarders/base.py b/backend/onboarders/base.py index 4d09cfeb..93a0b7b0 100644 --- a/backend/onboarders/base.py +++ b/backend/onboarders/base.py @@ -5,15 +5,18 @@ from utils.s3 import read_from_s3, read_excel_from_s3, save_csv_to_s3 class OnboarderBase: # Input dataset to be transformed data: pd.DataFrame | None = None + bucket_name = None + input_file_name = None + output_file_name = None # Description columns landlord_wall_construction: str = "landlord_wall_construction" landlord_roof_construction: str = "landlord_roof_construction" landlord_floor_construction: str = "landlord_floor_construction" - landlord_windows_construction: str = "landlord_windows_construction" + landlord_windows_type: str = "landlord_windows_type" landlord_heating_construction: str = "landlord_heating_construction" - landlord_fuel_construction: str = "landlord_fuel_construction" - landlord_heating_controls_construction: str = "landlord_heating_controls_construction" - landlord_hot_water_system_construction: str = "landlord_hot_water_system_construction" + landlord_fuel_type: str = "landlord_fuel_type" + landlord_heating_controls: str = "landlord_heating_controls" + landlord_hot_water_system: str = "landlord_hot_water_system" # Efficiency columns landlord_roof_efficiency: str = "landlord_roof_efficiency" @@ -37,22 +40,28 @@ class OnboarderBase: landlord_property_type: str = "landlord_property_type" landlord_built_form: str = "landlord_built_form" - def read_s3(self, bucket_name: str, file_name: str, **kwargs): - if kwargs.get("format") == "xlsx": + def read_s3(self, file_format, **kwargs): + + if self.input_file_name is None or self.bucket_name is None: + raise ValueError("Bucket name and input file name must be set before reading from S3.") + if file_format == "xlsx": self.data = read_excel_from_s3( - bucket_name=bucket_name, - file_key=file_name, + bucket_name=self.bucket_name, + file_key=self.input_file_name, sheet_name=kwargs.get("sheet_name"), header_row=kwargs.get("header_row", 0) ) else: - self.data = read_from_s3(bucket_name=bucket_name, s3_file_name=file_name) + self.data = read_from_s3(bucket_name=self.bucket_name, s3_file_name=self.input_file_name) - def write(self, bucket_name: str, file_name: str): + def write(self): if self.data is None: raise ValueError("No data to write. Please run transform() before writing.") + + if self.bucket_name is None or self.output_file_name is None: + raise ValueError("Bucket name and output file name must be set before writing to S3.") # Store file as csv - will store in the same route location as the input file - save_csv_to_s3(dataframe=self.data, bucket_name=bucket_name, file_name=file_name) + save_csv_to_s3(dataframe=self.data, bucket_name=self.bucket_name, file_name=self.output_file_name) @staticmethod def assert_nulls_only_from_source_nulls(data: pd.DataFrame, original_column: str, mapped_column: str) -> bool: diff --git a/backend/onboarders/handler.py b/backend/onboarders/handler.py index dfff7788..be7c2f38 100644 --- a/backend/onboarders/handler.py +++ b/backend/onboarders/handler.py @@ -36,12 +36,13 @@ def handler(event, context): onboarder = Onboarder( fileuri=validated_event.s3_uri, format=validated_event.format, - sheet_name=validated_event.sheet_name + sheet_name=validated_event.sheet_name, + file_format=validated_event.format ) logger.info("Transforming data") onboarder.transform() - logger.info("Writing data") + logger.info(f"Writing data to {onboarder.output_file_name}, bucket: {onboarder.bucket_name}") onboarder.write() except Exception as e: logger.error(f"Failed to process record: {e}") diff --git a/backend/onboarders/parity.py b/backend/onboarders/parity.py index 2afc7a73..6c79d027 100644 --- a/backend/onboarders/parity.py +++ b/backend/onboarders/parity.py @@ -30,13 +30,16 @@ class ParityOnboarder(OnboarderBase): def __init__( self, fileuri: str, + file_format: str, **kwargs ): # Extract bucket, and filekey; Will be in the format s3://bucket/key - bucket_name = fileuri.split("/")[2] - file_name = "/".join(fileuri.split("/")[3:]) + self.bucket_name = fileuri.split("/")[2] + self.input_file_name = "/".join(fileuri.split("/")[3:]) + # Also prepare output file name + self.output_file_name = self.input_file_name.replace("." + file_format, "") + "_transformed.csv" - self.read_s3(bucket_name=bucket_name, file_name=file_name, **kwargs) + self.read_s3(file_format=file_format, **kwargs) pass def map_construction_age_band(self): @@ -242,7 +245,7 @@ class ParityOnboarder(OnboarderBase): # however self.data[ [ - self.landlord_windows_construction, + self.landlord_windows_type, self.landlord_windows_efficiency, self.landlord_multi_glaze_proportion, self.landlord_glazed_type, @@ -261,10 +264,10 @@ class ParityOnboarder(OnboarderBase): [ self.landlord_heating_construction, self.landlord_heating_efficiency, - self.landlord_fuel_construction, - self.landlord_heating_controls_construction, + self.landlord_fuel_type, + self.landlord_heating_controls, self.landlord_heating_controls_efficiency, - self.landlord_hot_water_system_construction, + self.landlord_hot_water_system, self.landlord_hot_water_efficiency ] ] = self.data[ @@ -301,17 +304,17 @@ class ParityOnboarder(OnboarderBase): self.landlord_roof_efficiency, self.landlord_has_sloping_ceiling, self.landlord_floor_construction, - self.landlord_windows_construction, + self.landlord_windows_type, self.landlord_windows_efficiency, self.landlord_multi_glaze_proportion, self.landlord_glazed_type, self.landlord_glazed_area, self.landlord_heating_construction, self.landlord_heating_efficiency, - self.landlord_fuel_construction, - self.landlord_heating_controls_construction, + self.landlord_fuel_type, + self.landlord_heating_controls, self.landlord_heating_controls_efficiency, - self.landlord_hot_water_system_construction, + self.landlord_hot_water_system, self.landlord_hot_water_efficiency ] ].rename( @@ -324,6 +327,17 @@ class ParityOnboarder(OnboarderBase): } ) + def extract_values(self): + for columns in [ + self.landlord_construction_age_band, self.landlord_property_type, self.landlord_built_form, + self.landlord_wall_construction, self.landlord_wall_efficiency, self.landlord_roof_construction, + self.landlord_roof_efficiency, self.landlord_floor_construction, self.landlord_windows_type, + self.landlord_windows_efficiency, self.landlord_heating_construction, self.landlord_heating_efficiency, + self.landlord_fuel_type, self.landlord_heating_controls, self.landlord_heating_controls_efficiency, + self.landlord_hot_water_system, self.landlord_hot_water_efficiency + ]: + self.data[columns] = self.data[columns].progress_apply(lambda x: x.value if hasattr(x, "value") else x) + def transform(self): # ------------ construction_age_band ------------ self.map_construction_age_band() @@ -354,3 +368,4 @@ class ParityOnboarder(OnboarderBase): # ------------ Formating ------------ self.select_columns() + self.extract_values() diff --git a/backend/onboarders/requirements.txt b/backend/onboarders/requirements.txt index e69de29b..907cb877 100644 --- a/backend/onboarders/requirements.txt +++ b/backend/onboarders/requirements.txt @@ -0,0 +1,6 @@ +boto3 +numpy==2.1.2 +pandas==2.2.3 +tqdm==4.66.5 +pydantic==2.9.2 +openpyxl==3.1.2 \ No newline at end of file