mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
84 lines
3.8 KiB
Python
84 lines
3.8 KiB
Python
import pandas as pd
|
|
from utils.s3 import read_from_s3, read_excel_from_s3, save_csv_to_s3
|
|
|
|
|
|
class OnboarderBase:
|
|
# Input dataset to be transformed
|
|
data: pd.DataFrame | None = None
|
|
bucket_name = None
|
|
input_file_name = None
|
|
output_file_name = None
|
|
# Description columns
|
|
landlord_wall_construction: str = "landlord_wall_construction"
|
|
landlord_roof_construction: str = "landlord_roof_construction"
|
|
landlord_floor_construction: str = "landlord_floor_construction"
|
|
landlord_windows_type: str = "landlord_windows_type"
|
|
landlord_heating_system: str = "landlord_heating_system"
|
|
landlord_fuel_type: str = "landlord_fuel_type"
|
|
landlord_heating_controls: str = "landlord_heating_controls"
|
|
landlord_hot_water_system: str = "landlord_hot_water_system"
|
|
|
|
# Efficiency columns
|
|
landlord_roof_efficiency: str = "landlord_roof_efficiency"
|
|
landlord_windows_efficiency: str = "landlord_windows_efficiency"
|
|
landlord_heating_controls_efficiency: str = "landlord_heating_controls_efficiency"
|
|
landlord_heating_efficiency: str = "landlord_heating_efficiency"
|
|
landlord_hot_water_efficiency: str = "landlord_hot_water_efficiency"
|
|
landlord_wall_efficiency: str = "landlord_wall_efficiency"
|
|
|
|
# Additional windows features
|
|
landlord_multi_glaze_proportion: str = "landlord_multi_glaze_proportion"
|
|
landlord_glazed_type: str = "landlord_glazed_type"
|
|
landlord_glazed_area: str = "landlord_glazed_area"
|
|
|
|
# Additional roof features
|
|
landlord_has_sloping_ceiling: str = "landlord_has_sloping_ceiling"
|
|
|
|
# Shape, dimensions, age
|
|
landlord_total_floor_area_m2: str = "landlord_total_floor_area_m2"
|
|
landlord_construction_age_band: str = "landlord_construction_age_band"
|
|
landlord_property_type: str = "landlord_property_type"
|
|
landlord_built_form: str = "landlord_built_form"
|
|
|
|
def read_s3(self, file_format, **kwargs):
|
|
|
|
if self.input_file_name is None or self.bucket_name is None:
|
|
raise ValueError("Bucket name and input file name must be set before reading from S3.")
|
|
if file_format == "xlsx":
|
|
self.data = read_excel_from_s3(
|
|
bucket_name=self.bucket_name,
|
|
file_key=self.input_file_name,
|
|
sheet_name=kwargs.get("sheet_name"),
|
|
header_row=kwargs.get("header_row", 0)
|
|
)
|
|
else:
|
|
self.data = read_from_s3(bucket_name=self.bucket_name, s3_file_name=self.input_file_name)
|
|
|
|
def write(self):
|
|
if self.data is None:
|
|
raise ValueError("No data to write. Please run transform() before writing.")
|
|
|
|
if self.bucket_name is None or self.output_file_name is None:
|
|
raise ValueError("Bucket name and output file name must be set before writing to S3.")
|
|
# Store file as csv - will store in the same route location as the input file
|
|
save_csv_to_s3(dataframe=self.data, bucket_name=self.bucket_name, file_name=self.output_file_name)
|
|
|
|
@staticmethod
|
|
def assert_nulls_only_from_source_nulls(data: pd.DataFrame, original_column: str, mapped_column: str) -> bool:
|
|
# We only allow nulls if the original value was null
|
|
null_vals = data[pd.isnull(data[mapped_column])]
|
|
if null_vals.empty:
|
|
return True
|
|
# We make sure all original values were null
|
|
assert pd.isnull(null_vals[original_column]).all(), (
|
|
f"Some values in {mapped_column} were not mapped, but original values were not null"
|
|
)
|
|
|
|
@staticmethod
|
|
def assert_no_nulls(data: pd.DataFrame, column: str):
|
|
assert pd.isnull(data[column]).sum() == 0, f"column {column} contains null values, but should not"
|
|
|
|
def map_construction_age_band(self):
|
|
raise NotImplementedError(
|
|
"This method should be implemented by subclasses to map construction age bands to descriptions"
|
|
)
|