ready for review (not deployed)

This commit is contained in:
Khalim Conn-Kowlessar 2026-02-05 14:07:43 +00:00
parent a10a3bb1aa
commit 71310526ef
6 changed files with 57 additions and 26 deletions

2
.idea/Model.iml generated
View file

@ -7,7 +7,7 @@
<sourceFolder url="file://$MODULE_DIR$/open_uprn" isTestSource="false" /> <sourceFolder url="file://$MODULE_DIR$/open_uprn" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/recommendations" isTestSource="false" /> <sourceFolder url="file://$MODULE_DIR$/recommendations" isTestSource="false" />
</content> </content>
<orderEntry type="jdk" jdkName="Fastapi-backend" jdkType="Python SDK" /> <orderEntry type="jdk" jdkName="Onboarder" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" /> <orderEntry type="sourceFolder" forTests="false" />
</component> </component>
</module> </module>

2
.idea/misc.xml generated
View file

@ -3,7 +3,7 @@
<component name="Black"> <component name="Black">
<option name="sdkName" value="Python 3.10 (backend)" /> <option name="sdkName" value="Python 3.10 (backend)" />
</component> </component>
<component name="ProjectRootManager" version="2" project-jdk-name="Fastapi-backend" project-jdk-type="Python SDK" /> <component name="ProjectRootManager" version="2" project-jdk-name="Onboarder" project-jdk-type="Python SDK" />
<component name="PyCharmProfessionalAdvertiser"> <component name="PyCharmProfessionalAdvertiser">
<option name="shown" value="true" /> <option name="shown" value="true" />
</component> </component>

View file

@ -5,15 +5,18 @@ from utils.s3 import read_from_s3, read_excel_from_s3, save_csv_to_s3
class OnboarderBase: class OnboarderBase:
# Input dataset to be transformed # Input dataset to be transformed
data: pd.DataFrame | None = None data: pd.DataFrame | None = None
bucket_name = None
input_file_name = None
output_file_name = None
# Description columns # Description columns
landlord_wall_construction: str = "landlord_wall_construction" landlord_wall_construction: str = "landlord_wall_construction"
landlord_roof_construction: str = "landlord_roof_construction" landlord_roof_construction: str = "landlord_roof_construction"
landlord_floor_construction: str = "landlord_floor_construction" landlord_floor_construction: str = "landlord_floor_construction"
landlord_windows_construction: str = "landlord_windows_construction" landlord_windows_type: str = "landlord_windows_type"
landlord_heating_construction: str = "landlord_heating_construction" landlord_heating_construction: str = "landlord_heating_construction"
landlord_fuel_construction: str = "landlord_fuel_construction" landlord_fuel_type: str = "landlord_fuel_type"
landlord_heating_controls_construction: str = "landlord_heating_controls_construction" landlord_heating_controls: str = "landlord_heating_controls"
landlord_hot_water_system_construction: str = "landlord_hot_water_system_construction" landlord_hot_water_system: str = "landlord_hot_water_system"
# Efficiency columns # Efficiency columns
landlord_roof_efficiency: str = "landlord_roof_efficiency" landlord_roof_efficiency: str = "landlord_roof_efficiency"
@ -37,22 +40,28 @@ class OnboarderBase:
landlord_property_type: str = "landlord_property_type" landlord_property_type: str = "landlord_property_type"
landlord_built_form: str = "landlord_built_form" landlord_built_form: str = "landlord_built_form"
def read_s3(self, bucket_name: str, file_name: str, **kwargs): def read_s3(self, file_format, **kwargs):
if kwargs.get("format") == "xlsx":
if self.input_file_name is None or self.bucket_name is None:
raise ValueError("Bucket name and input file name must be set before reading from S3.")
if file_format == "xlsx":
self.data = read_excel_from_s3( self.data = read_excel_from_s3(
bucket_name=bucket_name, bucket_name=self.bucket_name,
file_key=file_name, file_key=self.input_file_name,
sheet_name=kwargs.get("sheet_name"), sheet_name=kwargs.get("sheet_name"),
header_row=kwargs.get("header_row", 0) header_row=kwargs.get("header_row", 0)
) )
else: else:
self.data = read_from_s3(bucket_name=bucket_name, s3_file_name=file_name) self.data = read_from_s3(bucket_name=self.bucket_name, s3_file_name=self.input_file_name)
def write(self, bucket_name: str, file_name: str): def write(self):
if self.data is None: if self.data is None:
raise ValueError("No data to write. Please run transform() before writing.") raise ValueError("No data to write. Please run transform() before writing.")
if self.bucket_name is None or self.output_file_name is None:
raise ValueError("Bucket name and output file name must be set before writing to S3.")
# Store file as csv - will store in the same route location as the input file # Store file as csv - will store in the same route location as the input file
save_csv_to_s3(dataframe=self.data, bucket_name=bucket_name, file_name=file_name) save_csv_to_s3(dataframe=self.data, bucket_name=self.bucket_name, file_name=self.output_file_name)
@staticmethod @staticmethod
def assert_nulls_only_from_source_nulls(data: pd.DataFrame, original_column: str, mapped_column: str) -> bool: def assert_nulls_only_from_source_nulls(data: pd.DataFrame, original_column: str, mapped_column: str) -> bool:

View file

@ -36,12 +36,13 @@ def handler(event, context):
onboarder = Onboarder( onboarder = Onboarder(
fileuri=validated_event.s3_uri, fileuri=validated_event.s3_uri,
format=validated_event.format, format=validated_event.format,
sheet_name=validated_event.sheet_name sheet_name=validated_event.sheet_name,
file_format=validated_event.format
) )
logger.info("Transforming data") logger.info("Transforming data")
onboarder.transform() onboarder.transform()
logger.info("Writing data") logger.info(f"Writing data to {onboarder.output_file_name}, bucket: {onboarder.bucket_name}")
onboarder.write() onboarder.write()
except Exception as e: except Exception as e:
logger.error(f"Failed to process record: {e}") logger.error(f"Failed to process record: {e}")

View file

@ -30,13 +30,16 @@ class ParityOnboarder(OnboarderBase):
def __init__( def __init__(
self, self,
fileuri: str, fileuri: str,
file_format: str,
**kwargs **kwargs
): ):
# Extract bucket, and filekey; Will be in the format s3://bucket/key # Extract bucket, and filekey; Will be in the format s3://bucket/key
bucket_name = fileuri.split("/")[2] self.bucket_name = fileuri.split("/")[2]
file_name = "/".join(fileuri.split("/")[3:]) self.input_file_name = "/".join(fileuri.split("/")[3:])
# Also prepare output file name
self.output_file_name = self.input_file_name.replace("." + file_format, "") + "_transformed.csv"
self.read_s3(bucket_name=bucket_name, file_name=file_name, **kwargs) self.read_s3(file_format=file_format, **kwargs)
pass pass
def map_construction_age_band(self): def map_construction_age_band(self):
@ -242,7 +245,7 @@ class ParityOnboarder(OnboarderBase):
# however # however
self.data[ self.data[
[ [
self.landlord_windows_construction, self.landlord_windows_type,
self.landlord_windows_efficiency, self.landlord_windows_efficiency,
self.landlord_multi_glaze_proportion, self.landlord_multi_glaze_proportion,
self.landlord_glazed_type, self.landlord_glazed_type,
@ -261,10 +264,10 @@ class ParityOnboarder(OnboarderBase):
[ [
self.landlord_heating_construction, self.landlord_heating_construction,
self.landlord_heating_efficiency, self.landlord_heating_efficiency,
self.landlord_fuel_construction, self.landlord_fuel_type,
self.landlord_heating_controls_construction, self.landlord_heating_controls,
self.landlord_heating_controls_efficiency, self.landlord_heating_controls_efficiency,
self.landlord_hot_water_system_construction, self.landlord_hot_water_system,
self.landlord_hot_water_efficiency self.landlord_hot_water_efficiency
] ]
] = self.data[ ] = self.data[
@ -301,17 +304,17 @@ class ParityOnboarder(OnboarderBase):
self.landlord_roof_efficiency, self.landlord_roof_efficiency,
self.landlord_has_sloping_ceiling, self.landlord_has_sloping_ceiling,
self.landlord_floor_construction, self.landlord_floor_construction,
self.landlord_windows_construction, self.landlord_windows_type,
self.landlord_windows_efficiency, self.landlord_windows_efficiency,
self.landlord_multi_glaze_proportion, self.landlord_multi_glaze_proportion,
self.landlord_glazed_type, self.landlord_glazed_type,
self.landlord_glazed_area, self.landlord_glazed_area,
self.landlord_heating_construction, self.landlord_heating_construction,
self.landlord_heating_efficiency, self.landlord_heating_efficiency,
self.landlord_fuel_construction, self.landlord_fuel_type,
self.landlord_heating_controls_construction, self.landlord_heating_controls,
self.landlord_heating_controls_efficiency, self.landlord_heating_controls_efficiency,
self.landlord_hot_water_system_construction, self.landlord_hot_water_system,
self.landlord_hot_water_efficiency self.landlord_hot_water_efficiency
] ]
].rename( ].rename(
@ -324,6 +327,17 @@ class ParityOnboarder(OnboarderBase):
} }
) )
def extract_values(self):
    """Unwrap enum-like cells to their ``.value`` in the mapped landlord columns.

    Every cell in the columns listed below that exposes a ``value``
    attribute is replaced by that attribute; all other cells (plain
    strings, numbers, ``None``) are left untouched. ``self.data`` is
    updated in place, one column at a time.

    Raises:
        KeyError: if one of the listed columns is absent from ``self.data``.
    """

    def _unwrap(cell):
        # NOTE(review): any object with a ``value`` attribute is unwrapped,
        # not only Enum members - confirm no column holds such objects
        # unintentionally.
        return cell.value if hasattr(cell, "value") else cell

    target_columns = (
        self.landlord_construction_age_band,
        self.landlord_property_type,
        self.landlord_built_form,
        self.landlord_wall_construction,
        self.landlord_wall_efficiency,
        self.landlord_roof_construction,
        self.landlord_roof_efficiency,
        self.landlord_floor_construction,
        self.landlord_windows_type,
        self.landlord_windows_efficiency,
        self.landlord_heating_construction,
        self.landlord_heating_efficiency,
        self.landlord_fuel_type,
        self.landlord_heating_controls,
        self.landlord_heating_controls_efficiency,
        self.landlord_hot_water_system,
        self.landlord_hot_water_efficiency,
    )

    for column in target_columns:
        series = self.data[column]
        # ``progress_apply`` is only patched onto pandas objects once
        # ``tqdm.pandas()`` has been called; fall back to the plain ``apply``
        # so the transformation still works without the progress bar.
        apply_fn = getattr(series, "progress_apply", series.apply)
        self.data[column] = apply_fn(_unwrap)
def transform(self): def transform(self):
# ------------ construction_age_band ------------ # ------------ construction_age_band ------------
self.map_construction_age_band() self.map_construction_age_band()
@ -354,3 +368,4 @@ class ParityOnboarder(OnboarderBase):
# ------------ Formatting ------------ # ------------ Formatting ------------
self.select_columns() self.select_columns()
self.extract_values()

View file

@ -0,0 +1,6 @@
boto3
numpy==2.1.2
pandas==2.2.3
tqdm==4.66.5
pydantic==2.9.2
openpyxl==3.1.2