mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
ready for review (not deployed)
This commit is contained in:
parent
a10a3bb1aa
commit
71310526ef
6 changed files with 57 additions and 26 deletions
2
.idea/Model.iml
generated
2
.idea/Model.iml
generated
|
|
@ -7,7 +7,7 @@
|
|||
<sourceFolder url="file://$MODULE_DIR$/open_uprn" isTestSource="false" />
|
||||
<sourceFolder url="file://$MODULE_DIR$/recommendations" isTestSource="false" />
|
||||
</content>
|
||||
<orderEntry type="jdk" jdkName="Fastapi-backend" jdkType="Python SDK" />
|
||||
<orderEntry type="jdk" jdkName="Onboarder" jdkType="Python SDK" />
|
||||
<orderEntry type="sourceFolder" forTests="false" />
|
||||
</component>
|
||||
</module>
|
||||
2
.idea/misc.xml
generated
2
.idea/misc.xml
generated
|
|
@ -3,7 +3,7 @@
|
|||
<component name="Black">
|
||||
<option name="sdkName" value="Python 3.10 (backend)" />
|
||||
</component>
|
||||
<component name="ProjectRootManager" version="2" project-jdk-name="Fastapi-backend" project-jdk-type="Python SDK" />
|
||||
<component name="ProjectRootManager" version="2" project-jdk-name="Onboarder" project-jdk-type="Python SDK" />
|
||||
<component name="PyCharmProfessionalAdvertiser">
|
||||
<option name="shown" value="true" />
|
||||
</component>
|
||||
|
|
|
|||
|
|
@ -5,15 +5,18 @@ from utils.s3 import read_from_s3, read_excel_from_s3, save_csv_to_s3
|
|||
class OnboarderBase:
|
||||
# Input dataset to be transformed
|
||||
data: pd.DataFrame | None = None
|
||||
bucket_name = None
|
||||
input_file_name = None
|
||||
output_file_name = None
|
||||
# Description columns
|
||||
landlord_wall_construction: str = "landlord_wall_construction"
|
||||
landlord_roof_construction: str = "landlord_roof_construction"
|
||||
landlord_floor_construction: str = "landlord_floor_construction"
|
||||
landlord_windows_construction: str = "landlord_windows_construction"
|
||||
landlord_windows_type: str = "landlord_windows_type"
|
||||
landlord_heating_construction: str = "landlord_heating_construction"
|
||||
landlord_fuel_construction: str = "landlord_fuel_construction"
|
||||
landlord_heating_controls_construction: str = "landlord_heating_controls_construction"
|
||||
landlord_hot_water_system_construction: str = "landlord_hot_water_system_construction"
|
||||
landlord_fuel_type: str = "landlord_fuel_type"
|
||||
landlord_heating_controls: str = "landlord_heating_controls"
|
||||
landlord_hot_water_system: str = "landlord_hot_water_system"
|
||||
|
||||
# Efficiency columns
|
||||
landlord_roof_efficiency: str = "landlord_roof_efficiency"
|
||||
|
|
@ -37,22 +40,28 @@ class OnboarderBase:
|
|||
landlord_property_type: str = "landlord_property_type"
|
||||
landlord_built_form: str = "landlord_built_form"
|
||||
|
||||
def read_s3(self, bucket_name: str, file_name: str, **kwargs):
|
||||
if kwargs.get("format") == "xlsx":
|
||||
def read_s3(self, file_format, **kwargs):
    """Load the input file from S3 into ``self.data``.

    The location is taken from ``self.bucket_name`` and
    ``self.input_file_name``; both must be set beforehand.

    Args:
        file_format: ``"xlsx"`` selects the Excel reader; any other value
            goes through ``read_from_s3``.
        **kwargs: Optional ``sheet_name`` and ``header_row`` (default 0),
            consulted only for the Excel reader.

    Raises:
        ValueError: If the bucket name or input file name is not set.
    """
    location_missing = self.input_file_name is None or self.bucket_name is None
    if location_missing:
        raise ValueError("Bucket name and input file name must be set before reading from S3.")

    # Everything that is not an Excel workbook is handled by the generic reader.
    if file_format != "xlsx":
        self.data = read_from_s3(bucket_name=self.bucket_name, s3_file_name=self.input_file_name)
        return

    sheet = kwargs.get("sheet_name")
    header = kwargs.get("header_row", 0)
    self.data = read_excel_from_s3(
        bucket_name=self.bucket_name,
        file_key=self.input_file_name,
        sheet_name=sheet,
        header_row=header,
    )
|
||||
|
||||
def write(self, bucket_name: str, file_name: str):
|
||||
def write(self):
    """Persist ``self.data`` to S3 as a CSV file.

    The destination is ``self.output_file_name`` inside ``self.bucket_name``.

    Raises:
        ValueError: If there is no data to write, or if the bucket name or
            output file name has not been set.
    """
    frame = self.data
    if frame is None:
        raise ValueError("No data to write. Please run transform() before writing.")

    destination = (self.bucket_name, self.output_file_name)
    if any(part is None for part in destination):
        raise ValueError("Bucket name and output file name must be set before writing to S3.")

    # The output is always CSV, whatever format the input file had.
    bucket, key = destination
    save_csv_to_s3(dataframe=frame, bucket_name=bucket, file_name=key)
|
||||
|
||||
@staticmethod
|
||||
def assert_nulls_only_from_source_nulls(data: pd.DataFrame, original_column: str, mapped_column: str) -> bool:
|
||||
|
|
|
|||
|
|
@ -36,12 +36,13 @@ def handler(event, context):
|
|||
onboarder = Onboarder(
|
||||
fileuri=validated_event.s3_uri,
|
||||
format=validated_event.format,
|
||||
sheet_name=validated_event.sheet_name
|
||||
sheet_name=validated_event.sheet_name,
|
||||
file_format=validated_event.format
|
||||
)
|
||||
|
||||
logger.info("Transforming data")
|
||||
onboarder.transform()
|
||||
logger.info("Writing data")
|
||||
logger.info(f"Writing data to {onboarder.output_file_name}, bucket: {onboarder.bucket_name}")
|
||||
onboarder.write()
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to process record: {e}")
|
||||
|
|
|
|||
|
|
@ -30,13 +30,16 @@ class ParityOnboarder(OnboarderBase):
|
|||
def __init__(
    self,
    fileuri: str,
    file_format: str,
    **kwargs
):
    """Resolve the S3 locations for this run and load the input data.

    Sets ``bucket_name``, ``input_file_name`` and ``output_file_name`` on
    the instance, then calls ``read_s3``.

    Args:
        fileuri: Input location in the form ``s3://bucket/key``.
        file_format: Extension of the input file without the leading dot.
        **kwargs: Forwarded unchanged to ``read_s3``.

    Raises:
        ValueError: If ``fileuri`` does not contain a scheme separator, a
            bucket and a key.
    """
    # Everything after "scheme://" is "<bucket>/<key>"; the key may itself
    # contain further slashes, so only the first one is used as the split point.
    _, separator, location = fileuri.partition("://")
    bucket, _, key = location.partition("/")
    if not separator or not bucket or not key:
        raise ValueError(f"Expected an S3 URI of the form s3://bucket/key, got: {fileuri!r}")

    self.bucket_name = bucket
    self.input_file_name = key
    # Strip the extension only where it is the suffix of the key. A plain
    # str.replace would also remove matches inside directory names and so
    # move the output away from the input's location.
    self.output_file_name = key.removesuffix("." + file_format) + "_transformed.csv"

    self.read_s3(file_format=file_format, **kwargs)
|
||||
|
||||
def map_construction_age_band(self):
|
||||
|
|
@ -242,7 +245,7 @@ class ParityOnboarder(OnboarderBase):
|
|||
# however
|
||||
self.data[
|
||||
[
|
||||
self.landlord_windows_construction,
|
||||
self.landlord_windows_type,
|
||||
self.landlord_windows_efficiency,
|
||||
self.landlord_multi_glaze_proportion,
|
||||
self.landlord_glazed_type,
|
||||
|
|
@ -261,10 +264,10 @@ class ParityOnboarder(OnboarderBase):
|
|||
[
|
||||
self.landlord_heating_construction,
|
||||
self.landlord_heating_efficiency,
|
||||
self.landlord_fuel_construction,
|
||||
self.landlord_heating_controls_construction,
|
||||
self.landlord_fuel_type,
|
||||
self.landlord_heating_controls,
|
||||
self.landlord_heating_controls_efficiency,
|
||||
self.landlord_hot_water_system_construction,
|
||||
self.landlord_hot_water_system,
|
||||
self.landlord_hot_water_efficiency
|
||||
]
|
||||
] = self.data[
|
||||
|
|
@ -301,17 +304,17 @@ class ParityOnboarder(OnboarderBase):
|
|||
self.landlord_roof_efficiency,
|
||||
self.landlord_has_sloping_ceiling,
|
||||
self.landlord_floor_construction,
|
||||
self.landlord_windows_construction,
|
||||
self.landlord_windows_type,
|
||||
self.landlord_windows_efficiency,
|
||||
self.landlord_multi_glaze_proportion,
|
||||
self.landlord_glazed_type,
|
||||
self.landlord_glazed_area,
|
||||
self.landlord_heating_construction,
|
||||
self.landlord_heating_efficiency,
|
||||
self.landlord_fuel_construction,
|
||||
self.landlord_heating_controls_construction,
|
||||
self.landlord_fuel_type,
|
||||
self.landlord_heating_controls,
|
||||
self.landlord_heating_controls_efficiency,
|
||||
self.landlord_hot_water_system_construction,
|
||||
self.landlord_hot_water_system,
|
||||
self.landlord_hot_water_efficiency
|
||||
]
|
||||
].rename(
|
||||
|
|
@ -324,6 +327,17 @@ class ParityOnboarder(OnboarderBase):
|
|||
}
|
||||
)
|
||||
|
||||
def extract_values(self):
    """Unwrap cells that expose a ``value`` attribute in the landlord columns.

    For each listed column of ``self.data``, a cell that has a ``value``
    attribute is replaced by that attribute; every other cell is left as
    it is. The columns are rewritten in place.
    NOTE(review): the wrapped cells are presumably Enum members produced by
    the mapping steps; confirm against those steps.
    """
    value_columns = [
        self.landlord_construction_age_band, self.landlord_property_type, self.landlord_built_form,
        self.landlord_wall_construction, self.landlord_wall_efficiency, self.landlord_roof_construction,
        self.landlord_roof_efficiency, self.landlord_floor_construction, self.landlord_windows_type,
        self.landlord_windows_efficiency, self.landlord_heating_construction, self.landlord_heating_efficiency,
        self.landlord_fuel_type, self.landlord_heating_controls, self.landlord_heating_controls_efficiency,
        self.landlord_hot_water_system, self.landlord_hot_water_efficiency,
    ]

    def unwrap(cell):
        return cell.value if hasattr(cell, "value") else cell

    for column in value_columns:
        series = self.data[column]
        # ``progress_apply`` is only attached to pandas objects once
        # ``tqdm.pandas()`` has been called; without it, fall back to the
        # plain ``apply`` so the transformation still runs (minus the bar).
        apply = getattr(series, "progress_apply", series.apply)
        self.data[column] = apply(unwrap)
|
||||
|
||||
def transform(self):
|
||||
# ------------ construction_age_band ------------
|
||||
self.map_construction_age_band()
|
||||
|
|
@ -354,3 +368,4 @@ class ParityOnboarder(OnboarderBase):
|
|||
|
||||
# ------------ Formatting ------------
|
||||
self.select_columns()
|
||||
self.extract_values()
|
||||
|
|
|
|||
|
|
@ -0,0 +1,6 @@
|
|||
boto3
|
||||
numpy==2.1.2
|
||||
pandas==2.2.3
|
||||
tqdm==4.66.5
|
||||
pydantic==2.9.2
|
||||
openpyxl==3.1.2
|
||||
Loading…
Add table
Reference in a new issue