# Model/backend/onboarders/base.py

import pandas as pd
from utils.s3 import read_from_s3, read_excel_from_s3, save_csv_to_s3


class OnboarderBase:
    # Input dataset to be transformed
    data: pd.DataFrame | None = None
    bucket_name = None
    input_file_name = None
    output_file_name = None
    # Description columns
    landlord_wall_construction: str = "landlord_wall_construction"
    landlord_roof_construction: str = "landlord_roof_construction"
    landlord_floor_construction: str = "landlord_floor_construction"
    landlord_windows_type: str = "landlord_windows_type"
    landlord_heating_system: str = "landlord_heating_system"
    landlord_fuel_type: str = "landlord_fuel_type"
    landlord_heating_controls: str = "landlord_heating_controls"
    landlord_hot_water_system: str = "landlord_hot_water_system"

    # Efficiency columns
    landlord_roof_efficiency: str = "landlord_roof_efficiency"
    landlord_windows_efficiency: str = "landlord_windows_efficiency"
    landlord_heating_controls_efficiency: str = "landlord_heating_controls_efficiency"
    landlord_heating_efficiency: str = "landlord_heating_efficiency"
    landlord_hot_water_efficiency: str = "landlord_hot_water_efficiency"
    landlord_wall_efficiency: str = "landlord_wall_efficiency"

    # Additional windows features
    landlord_multi_glaze_proportion: str = "landlord_multi_glaze_proportion"
    landlord_glazed_type: str = "landlord_glazed_type"
    landlord_glazed_area: str = "landlord_glazed_area"

    # Additional roof features
    landlord_has_sloping_ceiling: str = "landlord_has_sloping_ceiling"

    # Shape, dimensions, age
    landlord_total_floor_area_m2: str = "landlord_total_floor_area_m2"
    landlord_construction_age_band: str = "landlord_construction_age_band"
    landlord_property_type: str = "landlord_property_type"
    landlord_built_form: str = "landlord_built_form"

    def read_s3(self, file_format, **kwargs):

        if self.input_file_name is None or self.bucket_name is None:
            raise ValueError("Bucket name and input file name must be set before reading from S3.")
        if file_format == "xlsx":
            self.data = read_excel_from_s3(
                bucket_name=self.bucket_name,
                file_key=self.input_file_name,
                sheet_name=kwargs.get("sheet_name"),
                header_row=kwargs.get("header_row", 0)
            )
        else:
            self.data = read_from_s3(bucket_name=self.bucket_name, s3_file_name=self.input_file_name)

    def write(self):
        if self.data is None:
            raise ValueError("No data to write. Please run transform() before writing.")

        if self.bucket_name is None or self.output_file_name is None:
            raise ValueError("Bucket name and output file name must be set before writing to S3.")
        # Store file as csv - will store in the same route location as the input file
        save_csv_to_s3(dataframe=self.data, bucket_name=self.bucket_name, file_name=self.output_file_name)

    @staticmethod
    def assert_nulls_only_from_source_nulls(data: pd.DataFrame, original_column: str, mapped_column: str) -> bool:
        # We only allow nulls if the original value was null
        null_vals = data[pd.isnull(data[mapped_column])]
        if null_vals.empty:
            return True
        # We make sure all original values were null
        assert pd.isnull(null_vals[original_column]).all(), (
            f"Some values in {mapped_column} were not mapped, but original values were not null"
        )

    @staticmethod
    def assert_no_nulls(data: pd.DataFrame, column: str):
        assert pd.isnull(data[column]).sum() == 0, f"column {column} contains null values, but should not"

    def map_construction_age_band(self):
        raise NotImplementedError(
            "This method should be implemented by subclasses to map construction age bands to descriptions"
        )