diff --git a/.idea/Model.iml b/.idea/Model.iml
index c6561970..a079cfaf 100644
--- a/.idea/Model.iml
+++ b/.idea/Model.iml
@@ -7,7 +7,7 @@
-
+
\ No newline at end of file
diff --git a/.idea/misc.xml b/.idea/misc.xml
index 50cad4ca..b6084632 100644
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
@@ -3,7 +3,7 @@
-
+
diff --git a/backend/onboarders/base.py b/backend/onboarders/base.py
index 4d09cfeb..93a0b7b0 100644
--- a/backend/onboarders/base.py
+++ b/backend/onboarders/base.py
@@ -5,15 +5,18 @@ from utils.s3 import read_from_s3, read_excel_from_s3, save_csv_to_s3
class OnboarderBase:
# Input dataset to be transformed
data: pd.DataFrame | None = None
+ bucket_name = None
+ input_file_name = None
+ output_file_name = None
# Description columns
landlord_wall_construction: str = "landlord_wall_construction"
landlord_roof_construction: str = "landlord_roof_construction"
landlord_floor_construction: str = "landlord_floor_construction"
- landlord_windows_construction: str = "landlord_windows_construction"
+ landlord_windows_type: str = "landlord_windows_type"
landlord_heating_construction: str = "landlord_heating_construction"
- landlord_fuel_construction: str = "landlord_fuel_construction"
- landlord_heating_controls_construction: str = "landlord_heating_controls_construction"
- landlord_hot_water_system_construction: str = "landlord_hot_water_system_construction"
+ landlord_fuel_type: str = "landlord_fuel_type"
+ landlord_heating_controls: str = "landlord_heating_controls"
+ landlord_hot_water_system: str = "landlord_hot_water_system"
# Efficiency columns
landlord_roof_efficiency: str = "landlord_roof_efficiency"
@@ -37,22 +40,28 @@ class OnboarderBase:
landlord_property_type: str = "landlord_property_type"
landlord_built_form: str = "landlord_built_form"
- def read_s3(self, bucket_name: str, file_name: str, **kwargs):
- if kwargs.get("format") == "xlsx":
+ def read_s3(self, file_format, **kwargs):
+
+ if self.input_file_name is None or self.bucket_name is None:
+ raise ValueError("Bucket name and input file name must be set before reading from S3.")
+ if file_format == "xlsx":
self.data = read_excel_from_s3(
- bucket_name=bucket_name,
- file_key=file_name,
+ bucket_name=self.bucket_name,
+ file_key=self.input_file_name,
sheet_name=kwargs.get("sheet_name"),
header_row=kwargs.get("header_row", 0)
)
else:
- self.data = read_from_s3(bucket_name=bucket_name, s3_file_name=file_name)
+ self.data = read_from_s3(bucket_name=self.bucket_name, s3_file_name=self.input_file_name)
- def write(self, bucket_name: str, file_name: str):
+ def write(self):
if self.data is None:
raise ValueError("No data to write. Please run transform() before writing.")
+
+ if self.bucket_name is None or self.output_file_name is None:
+ raise ValueError("Bucket name and output file name must be set before writing to S3.")
# Store file as csv - will store in the same route location as the input file
- save_csv_to_s3(dataframe=self.data, bucket_name=bucket_name, file_name=file_name)
+ save_csv_to_s3(dataframe=self.data, bucket_name=self.bucket_name, file_name=self.output_file_name)
@staticmethod
def assert_nulls_only_from_source_nulls(data: pd.DataFrame, original_column: str, mapped_column: str) -> bool:
diff --git a/backend/onboarders/handler.py b/backend/onboarders/handler.py
index dfff7788..be7c2f38 100644
--- a/backend/onboarders/handler.py
+++ b/backend/onboarders/handler.py
@@ -36,12 +36,13 @@ def handler(event, context):
onboarder = Onboarder(
fileuri=validated_event.s3_uri,
format=validated_event.format,
- sheet_name=validated_event.sheet_name
+ sheet_name=validated_event.sheet_name,
+ file_format=validated_event.format
)
logger.info("Transforming data")
onboarder.transform()
- logger.info("Writing data")
+ logger.info(f"Writing data to {onboarder.output_file_name}, bucket: {onboarder.bucket_name}")
onboarder.write()
except Exception as e:
logger.error(f"Failed to process record: {e}")
diff --git a/backend/onboarders/parity.py b/backend/onboarders/parity.py
index 2afc7a73..6c79d027 100644
--- a/backend/onboarders/parity.py
+++ b/backend/onboarders/parity.py
@@ -30,13 +30,16 @@ class ParityOnboarder(OnboarderBase):
def __init__(
self,
fileuri: str,
+ file_format: str,
**kwargs
):
# Extract bucket, and filekey; Will be in the format s3://bucket/key
- bucket_name = fileuri.split("/")[2]
- file_name = "/".join(fileuri.split("/")[3:])
+ self.bucket_name = fileuri.split("/")[2]
+ self.input_file_name = "/".join(fileuri.split("/")[3:])
+ # Also prepare output file name
+ self.output_file_name = self.input_file_name.replace("." + file_format, "") + "_transformed.csv"
- self.read_s3(bucket_name=bucket_name, file_name=file_name, **kwargs)
+ self.read_s3(file_format=file_format, **kwargs)
pass
def map_construction_age_band(self):
@@ -242,7 +245,7 @@ class ParityOnboarder(OnboarderBase):
# however
self.data[
[
- self.landlord_windows_construction,
+ self.landlord_windows_type,
self.landlord_windows_efficiency,
self.landlord_multi_glaze_proportion,
self.landlord_glazed_type,
@@ -261,10 +264,10 @@ class ParityOnboarder(OnboarderBase):
[
self.landlord_heating_construction,
self.landlord_heating_efficiency,
- self.landlord_fuel_construction,
- self.landlord_heating_controls_construction,
+ self.landlord_fuel_type,
+ self.landlord_heating_controls,
self.landlord_heating_controls_efficiency,
- self.landlord_hot_water_system_construction,
+ self.landlord_hot_water_system,
self.landlord_hot_water_efficiency
]
] = self.data[
@@ -301,17 +304,17 @@ class ParityOnboarder(OnboarderBase):
self.landlord_roof_efficiency,
self.landlord_has_sloping_ceiling,
self.landlord_floor_construction,
- self.landlord_windows_construction,
+ self.landlord_windows_type,
self.landlord_windows_efficiency,
self.landlord_multi_glaze_proportion,
self.landlord_glazed_type,
self.landlord_glazed_area,
self.landlord_heating_construction,
self.landlord_heating_efficiency,
- self.landlord_fuel_construction,
- self.landlord_heating_controls_construction,
+ self.landlord_fuel_type,
+ self.landlord_heating_controls,
self.landlord_heating_controls_efficiency,
- self.landlord_hot_water_system_construction,
+ self.landlord_hot_water_system,
self.landlord_hot_water_efficiency
]
].rename(
@@ -324,6 +327,17 @@ class ParityOnboarder(OnboarderBase):
}
)
+ def extract_values(self):
+ for columns in [
+ self.landlord_construction_age_band, self.landlord_property_type, self.landlord_built_form,
+ self.landlord_wall_construction, self.landlord_wall_efficiency, self.landlord_roof_construction,
+ self.landlord_roof_efficiency, self.landlord_floor_construction, self.landlord_windows_type,
+ self.landlord_windows_efficiency, self.landlord_heating_construction, self.landlord_heating_efficiency,
+ self.landlord_fuel_type, self.landlord_heating_controls, self.landlord_heating_controls_efficiency,
+ self.landlord_hot_water_system, self.landlord_hot_water_efficiency
+ ]:
+ self.data[columns] = self.data[columns].progress_apply(lambda x: x.value if hasattr(x, "value") else x)
+
def transform(self):
# ------------ construction_age_band ------------
self.map_construction_age_band()
@@ -354,3 +368,4 @@ class ParityOnboarder(OnboarderBase):
# ------------ Formating ------------
self.select_columns()
+ self.extract_values()
diff --git a/backend/onboarders/requirements.txt b/backend/onboarders/requirements.txt
index e69de29b..907cb877 100644
--- a/backend/onboarders/requirements.txt
+++ b/backend/onboarders/requirements.txt
@@ -0,0 +1,6 @@
+boto3
+numpy==2.1.2
+pandas==2.2.3
+tqdm==4.66.5
+pydantic==2.9.2
+openpyxl==3.1.2
\ No newline at end of file