ready for review (not deployed

2026-06-08 11:17:27 +00:00 · 2026-02-05 14:07:43 +00:00 · 2026-02-05 14:07:43 +00:00 · 71310526ef
commit 71310526ef
parent a10a3bb1aa
6 changed files with 57 additions and 26 deletions
--- a/.idea/Model.iml
+++ b/.idea/Model.iml
@ -7,7 +7,7 @@
      <sourceFolder url="file://$MODULE_DIR$/open_uprn" isTestSource="false" />
      <sourceFolder url="file://$MODULE_DIR$/recommendations" isTestSource="false" />
    </content>
-    <orderEntry type="jdk" jdkName="Fastapi-backend" jdkType="Python SDK" />
+    <orderEntry type="jdk" jdkName="Onboarder" jdkType="Python SDK" />
    <orderEntry type="sourceFolder" forTests="false" />
  </component>
 </module>
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
@ -3,7 +3,7 @@
  <component name="Black">
    <option name="sdkName" value="Python 3.10 (backend)" />
  </component>
-  <component name="ProjectRootManager" version="2" project-jdk-name="Fastapi-backend" project-jdk-type="Python SDK" />
+  <component name="ProjectRootManager" version="2" project-jdk-name="Onboarder" project-jdk-type="Python SDK" />
  <component name="PyCharmProfessionalAdvertiser">
    <option name="shown" value="true" />
  </component>
--- a/backend/onboarders/base.py
+++ b/backend/onboarders/base.py
@ -5,15 +5,18 @@ from utils.s3 import read_from_s3, read_excel_from_s3, save_csv_to_s3
 class OnboarderBase:
    # Input dataset to be transformed
    data: pd.DataFrame | None = None
    bucket_name = None
    input_file_name = None
    output_file_name = None
    # Description columns
    landlord_wall_construction: str = "landlord_wall_construction"
    landlord_roof_construction: str = "landlord_roof_construction"
    landlord_floor_construction: str = "landlord_floor_construction"
-    landlord_windows_construction: str = "landlord_windows_construction"
+    landlord_windows_type: str = "landlord_windows_type"
    landlord_heating_construction: str = "landlord_heating_construction"
-    landlord_fuel_construction: str = "landlord_fuel_construction"
+    landlord_fuel_type: str = "landlord_fuel_type"
-    landlord_heating_controls_construction: str = "landlord_heating_controls_construction"
+    landlord_heating_controls: str = "landlord_heating_controls"
-    landlord_hot_water_system_construction: str = "landlord_hot_water_system_construction"
+    landlord_hot_water_system: str = "landlord_hot_water_system"
    # Efficiency columns
    landlord_roof_efficiency: str = "landlord_roof_efficiency"
@ -37,22 +40,28 @@ class OnboarderBase:
    landlord_property_type: str = "landlord_property_type"
    landlord_built_form: str = "landlord_built_form"
-    def read_s3(self, bucket_name: str, file_name: str, **kwargs):
+    def read_s3(self, file_format, **kwargs):
-        if kwargs.get("format") == "xlsx":
+
        if self.input_file_name is None or self.bucket_name is None:
            raise ValueError("Bucket name and input file name must be set before reading from S3.")
        if file_format == "xlsx":
            self.data = read_excel_from_s3(
-                bucket_name=bucket_name,
+                bucket_name=self.bucket_name,
-                file_key=file_name,
+                file_key=self.input_file_name,
                sheet_name=kwargs.get("sheet_name"),
                header_row=kwargs.get("header_row", 0)
            )
        else:
-            self.data = read_from_s3(bucket_name=bucket_name, s3_file_name=file_name)
+            self.data = read_from_s3(bucket_name=self.bucket_name, s3_file_name=self.input_file_name)
-    def write(self, bucket_name: str, file_name: str):
+    def write(self):
        if self.data is None:
            raise ValueError("No data to write. Please run transform() before writing.")
        if self.bucket_name is None or self.output_file_name is None:
            raise ValueError("Bucket name and output file name must be set before writing to S3.")
        # Store file as csv - will store in the same route location as the input file
-        save_csv_to_s3(dataframe=self.data, bucket_name=bucket_name, file_name=file_name)
+        save_csv_to_s3(dataframe=self.data, bucket_name=self.bucket_name, file_name=self.output_file_name)
    @staticmethod
    def assert_nulls_only_from_source_nulls(data: pd.DataFrame, original_column: str, mapped_column: str) -> bool:
--- a/backend/onboarders/handler.py
+++ b/backend/onboarders/handler.py
@ -36,12 +36,13 @@ def handler(event, context):
            onboarder = Onboarder(
                fileuri=validated_event.s3_uri,
                format=validated_event.format,
-                sheet_name=validated_event.sheet_name
+                sheet_name=validated_event.sheet_name,
                file_format=validated_event.format
            )
            logger.info("Transforming data")
            onboarder.transform()
-            logger.info("Writing data")
+            logger.info(f"Writing data to {onboarder.output_file_name}, bucket: {onboarder.bucket_name}")
            onboarder.write()
        except Exception as e:
            logger.error(f"Failed to process record: {e}")
--- a/backend/onboarders/parity.py
+++ b/backend/onboarders/parity.py
@ -30,13 +30,16 @@ class ParityOnboarder(OnboarderBase):
    def __init__(
        self,
        fileuri: str,
        file_format: str,
        **kwargs
    ):
        # Extract bucket, and filekey; Will be in the format s3://bucket/key
-        bucket_name = fileuri.split("/")[2]
+        self.bucket_name = fileuri.split("/")[2]
-        file_name = "/".join(fileuri.split("/")[3:])
+        self.input_file_name = "/".join(fileuri.split("/")[3:])
        # Also prepare output file name
        self.output_file_name = self.input_file_name.replace("." + file_format, "") + "_transformed.csv"
-        self.read_s3(bucket_name=bucket_name, file_name=file_name, **kwargs)
+        self.read_s3(file_format=file_format, **kwargs)
        pass
    def map_construction_age_band(self):
@ -242,7 +245,7 @@ class ParityOnboarder(OnboarderBase):
        #       however
        self.data[
            [
-                self.landlord_windows_construction,
+                self.landlord_windows_type,
                self.landlord_windows_efficiency,
                self.landlord_multi_glaze_proportion,
                self.landlord_glazed_type,
@ -261,10 +264,10 @@ class ParityOnboarder(OnboarderBase):
            [
                self.landlord_heating_construction,
                self.landlord_heating_efficiency,
-                self.landlord_fuel_construction,
+                self.landlord_fuel_type,
-                self.landlord_heating_controls_construction,
+                self.landlord_heating_controls,
                self.landlord_heating_controls_efficiency,
-                self.landlord_hot_water_system_construction,
+                self.landlord_hot_water_system,
                self.landlord_hot_water_efficiency
            ]
        ] = self.data[
@ -301,17 +304,17 @@ class ParityOnboarder(OnboarderBase):
                self.landlord_roof_efficiency,
                self.landlord_has_sloping_ceiling,
                self.landlord_floor_construction,
-                self.landlord_windows_construction,
+                self.landlord_windows_type,
                self.landlord_windows_efficiency,
                self.landlord_multi_glaze_proportion,
                self.landlord_glazed_type,
                self.landlord_glazed_area,
                self.landlord_heating_construction,
                self.landlord_heating_efficiency,
-                self.landlord_fuel_construction,
+                self.landlord_fuel_type,
-                self.landlord_heating_controls_construction,
+                self.landlord_heating_controls,
                self.landlord_heating_controls_efficiency,
-                self.landlord_hot_water_system_construction,
+                self.landlord_hot_water_system,
                self.landlord_hot_water_efficiency
            ]
        ].rename(
@ -324,6 +327,17 @@ class ParityOnboarder(OnboarderBase):
            }
        )
    def extract_values(self):
        for columns in [
            self.landlord_construction_age_band, self.landlord_property_type, self.landlord_built_form,
            self.landlord_wall_construction, self.landlord_wall_efficiency, self.landlord_roof_construction,
            self.landlord_roof_efficiency, self.landlord_floor_construction, self.landlord_windows_type,
            self.landlord_windows_efficiency, self.landlord_heating_construction, self.landlord_heating_efficiency,
            self.landlord_fuel_type, self.landlord_heating_controls, self.landlord_heating_controls_efficiency,
            self.landlord_hot_water_system, self.landlord_hot_water_efficiency
        ]:
            self.data[columns] = self.data[columns].progress_apply(lambda x: x.value if hasattr(x, "value") else x)
    def transform(self):
        # ------------ construction_age_band ------------
        self.map_construction_age_band()
@ -354,3 +368,4 @@ class ParityOnboarder(OnboarderBase):
        # ------------ Formating ------------
        self.select_columns()
        self.extract_values()
--- a/backend/onboarders/requirements.txt
+++ b/backend/onboarders/requirements.txt
@ -0,0 +1,6 @@
 boto3
 numpy==2.1.2
 pandas==2.2.3
 tqdm==4.66.5
 pydantic==2.9.2
 openpyxl==3.1.2