import pandas as pd

from utils.s3 import read_from_s3, read_excel_from_s3, save_csv_to_s3


class OnboarderBase:
    """Base class for onboarders that load a dataset from S3, transform it and write it back.

    The class holds the working dataframe, the S3 locations to read from and
    write to, and the names of the standardised ``landlord_*`` columns.
    Subclasses are expected to set ``bucket_name``, ``input_file_name`` and
    ``output_file_name`` and to implement the mapping methods.
    """

    # Input dataset to be transformed
    data: pd.DataFrame | None = None

    # S3 locations; must be set before calling read_s3() / write()
    bucket_name: str | None = None
    input_file_name: str | None = None
    output_file_name: str | None = None

    # Description columns
    landlord_wall_construction: str = "landlord_wall_construction"
    landlord_roof_construction: str = "landlord_roof_construction"
    landlord_floor_construction: str = "landlord_floor_construction"
    landlord_windows_type: str = "landlord_windows_type"
    landlord_heating_system: str = "landlord_heating_system"
    landlord_fuel_type: str = "landlord_fuel_type"
    landlord_heating_controls: str = "landlord_heating_controls"
    landlord_hot_water_system: str = "landlord_hot_water_system"

    # Efficiency columns
    landlord_roof_efficiency: str = "landlord_roof_efficiency"
    landlord_windows_efficiency: str = "landlord_windows_efficiency"
    landlord_heating_controls_efficiency: str = "landlord_heating_controls_efficiency"
    landlord_heating_efficiency: str = "landlord_heating_efficiency"
    landlord_hot_water_efficiency: str = "landlord_hot_water_efficiency"
    landlord_wall_efficiency: str = "landlord_wall_efficiency"

    # Additional windows features
    landlord_multi_glaze_proportion: str = "landlord_multi_glaze_proportion"
    landlord_glazed_type: str = "landlord_glazed_type"
    landlord_glazed_area: str = "landlord_glazed_area"

    # Additional roof features
    landlord_has_sloping_ceiling: str = "landlord_has_sloping_ceiling"

    # Shape, dimensions, age
    landlord_total_floor_area_m2: str = "landlord_total_floor_area_m2"
    landlord_construction_age_band: str = "landlord_construction_age_band"
    landlord_property_type: str = "landlord_property_type"
    landlord_built_form: str = "landlord_built_form"

    def read_s3(self, file_format, **kwargs) -> None:
        """Load the input file from S3 into ``self.data``.

        Args:
            file_format: ``"xlsx"`` reads through ``read_excel_from_s3``; any
                other value reads through ``read_from_s3``.
            **kwargs: Only used for ``"xlsx"``. ``sheet_name`` (defaults to
                ``None``) and ``header_row`` (defaults to ``0``) are forwarded
                to ``read_excel_from_s3``.

        Raises:
            ValueError: If ``input_file_name`` or ``bucket_name`` is not set.
        """
        if self.input_file_name is None or self.bucket_name is None:
            raise ValueError("Bucket name and input file name must be set before reading from S3.")

        if file_format == "xlsx":
            self.data = read_excel_from_s3(
                bucket_name=self.bucket_name,
                file_key=self.input_file_name,
                sheet_name=kwargs.get("sheet_name"),
                header_row=kwargs.get("header_row", 0),
            )
        else:
            self.data = read_from_s3(bucket_name=self.bucket_name, s3_file_name=self.input_file_name)

    def write(self) -> None:
        """Save ``self.data`` to S3 as a CSV under ``output_file_name``.

        Raises:
            ValueError: If there is no data loaded, or if ``bucket_name`` or
                ``output_file_name`` is not set.
        """
        if self.data is None:
            raise ValueError("No data to write. Please run transform() before writing.")

        if self.bucket_name is None or self.output_file_name is None:
            raise ValueError("Bucket name and output file name must be set before writing to S3.")

        # Persist as CSV in the same bucket the input was read from
        save_csv_to_s3(dataframe=self.data, bucket_name=self.bucket_name, file_name=self.output_file_name)

    @staticmethod
    def assert_nulls_only_from_source_nulls(data: pd.DataFrame, original_column: str, mapped_column: str) -> bool:
        """Check that every null in ``mapped_column`` comes from a null in ``original_column``.

        Args:
            data: Dataframe containing both columns.
            original_column: Name of the source column that was mapped.
            mapped_column: Name of the column produced by the mapping.

        Returns:
            ``True`` when the check passes (including when there are no nulls
            in ``mapped_column`` at all).

        Raises:
            AssertionError: If a row has a null mapped value but a non-null
                original value.
        """
        # We only allow nulls if the original value was null
        null_vals = data[pd.isnull(data[mapped_column])]
        if null_vals.empty:
            return True

        # Raised explicitly (rather than via `assert`) so the check still runs
        # when Python is started with -O, which strips assert statements.
        if not pd.isnull(null_vals[original_column]).all():
            raise AssertionError(
                f"Some values in {mapped_column} were not mapped, but original values were not null"
            )

        # Success path must also return True to honour the declared return type
        return True

    @staticmethod
    def assert_no_nulls(data: pd.DataFrame, column: str) -> None:
        """Check that ``column`` of ``data`` contains no null values.

        Raises:
            AssertionError: If at least one value in ``column`` is null.
        """
        # Raised explicitly so the check is not stripped under `python -O`
        if pd.isnull(data[column]).sum() != 0:
            raise AssertionError(f"column {column} contains null values, but should not")

    def map_construction_age_band(self):
        """Map construction age bands to descriptions; must be overridden by subclasses.

        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError(
            "This method should be implemented by subclasses to map construction age bands to descriptions"
        )