mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
ready for review (not deployed)
This commit is contained in:
parent
a10a3bb1aa
commit
71310526ef
6 changed files with 57 additions and 26 deletions
2
.idea/Model.iml
generated
2
.idea/Model.iml
generated
|
|
@ -7,7 +7,7 @@
|
||||||
<sourceFolder url="file://$MODULE_DIR$/open_uprn" isTestSource="false" />
|
<sourceFolder url="file://$MODULE_DIR$/open_uprn" isTestSource="false" />
|
||||||
<sourceFolder url="file://$MODULE_DIR$/recommendations" isTestSource="false" />
|
<sourceFolder url="file://$MODULE_DIR$/recommendations" isTestSource="false" />
|
||||||
</content>
|
</content>
|
||||||
<orderEntry type="jdk" jdkName="Fastapi-backend" jdkType="Python SDK" />
|
<orderEntry type="jdk" jdkName="Onboarder" jdkType="Python SDK" />
|
||||||
<orderEntry type="sourceFolder" forTests="false" />
|
<orderEntry type="sourceFolder" forTests="false" />
|
||||||
</component>
|
</component>
|
||||||
</module>
|
</module>
|
||||||
2
.idea/misc.xml
generated
2
.idea/misc.xml
generated
|
|
@ -3,7 +3,7 @@
|
||||||
<component name="Black">
|
<component name="Black">
|
||||||
<option name="sdkName" value="Python 3.10 (backend)" />
|
<option name="sdkName" value="Python 3.10 (backend)" />
|
||||||
</component>
|
</component>
|
||||||
<component name="ProjectRootManager" version="2" project-jdk-name="Fastapi-backend" project-jdk-type="Python SDK" />
|
<component name="ProjectRootManager" version="2" project-jdk-name="Onboarder" project-jdk-type="Python SDK" />
|
||||||
<component name="PyCharmProfessionalAdvertiser">
|
<component name="PyCharmProfessionalAdvertiser">
|
||||||
<option name="shown" value="true" />
|
<option name="shown" value="true" />
|
||||||
</component>
|
</component>
|
||||||
|
|
|
||||||
|
|
@ -5,15 +5,18 @@ from utils.s3 import read_from_s3, read_excel_from_s3, save_csv_to_s3
|
||||||
class OnboarderBase:
|
class OnboarderBase:
|
||||||
# Input dataset to be transformed
|
# Input dataset to be transformed
|
||||||
data: pd.DataFrame | None = None
|
data: pd.DataFrame | None = None
|
||||||
|
bucket_name = None
|
||||||
|
input_file_name = None
|
||||||
|
output_file_name = None
|
||||||
# Description columns
|
# Description columns
|
||||||
landlord_wall_construction: str = "landlord_wall_construction"
|
landlord_wall_construction: str = "landlord_wall_construction"
|
||||||
landlord_roof_construction: str = "landlord_roof_construction"
|
landlord_roof_construction: str = "landlord_roof_construction"
|
||||||
landlord_floor_construction: str = "landlord_floor_construction"
|
landlord_floor_construction: str = "landlord_floor_construction"
|
||||||
landlord_windows_construction: str = "landlord_windows_construction"
|
landlord_windows_type: str = "landlord_windows_type"
|
||||||
landlord_heating_construction: str = "landlord_heating_construction"
|
landlord_heating_construction: str = "landlord_heating_construction"
|
||||||
landlord_fuel_construction: str = "landlord_fuel_construction"
|
landlord_fuel_type: str = "landlord_fuel_type"
|
||||||
landlord_heating_controls_construction: str = "landlord_heating_controls_construction"
|
landlord_heating_controls: str = "landlord_heating_controls"
|
||||||
landlord_hot_water_system_construction: str = "landlord_hot_water_system_construction"
|
landlord_hot_water_system: str = "landlord_hot_water_system"
|
||||||
|
|
||||||
# Efficiency columns
|
# Efficiency columns
|
||||||
landlord_roof_efficiency: str = "landlord_roof_efficiency"
|
landlord_roof_efficiency: str = "landlord_roof_efficiency"
|
||||||
|
|
@ -37,22 +40,28 @@ class OnboarderBase:
|
||||||
landlord_property_type: str = "landlord_property_type"
|
landlord_property_type: str = "landlord_property_type"
|
||||||
landlord_built_form: str = "landlord_built_form"
|
landlord_built_form: str = "landlord_built_form"
|
||||||
|
|
||||||
def read_s3(self, bucket_name: str, file_name: str, **kwargs):
|
def read_s3(self, file_format, **kwargs):
|
||||||
if kwargs.get("format") == "xlsx":
|
|
||||||
|
if self.input_file_name is None or self.bucket_name is None:
|
||||||
|
raise ValueError("Bucket name and input file name must be set before reading from S3.")
|
||||||
|
if file_format == "xlsx":
|
||||||
self.data = read_excel_from_s3(
|
self.data = read_excel_from_s3(
|
||||||
bucket_name=bucket_name,
|
bucket_name=self.bucket_name,
|
||||||
file_key=file_name,
|
file_key=self.input_file_name,
|
||||||
sheet_name=kwargs.get("sheet_name"),
|
sheet_name=kwargs.get("sheet_name"),
|
||||||
header_row=kwargs.get("header_row", 0)
|
header_row=kwargs.get("header_row", 0)
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
self.data = read_from_s3(bucket_name=bucket_name, s3_file_name=file_name)
|
self.data = read_from_s3(bucket_name=self.bucket_name, s3_file_name=self.input_file_name)
|
||||||
|
|
||||||
def write(self, bucket_name: str, file_name: str):
|
def write(self):
|
||||||
if self.data is None:
|
if self.data is None:
|
||||||
raise ValueError("No data to write. Please run transform() before writing.")
|
raise ValueError("No data to write. Please run transform() before writing.")
|
||||||
|
|
||||||
|
if self.bucket_name is None or self.output_file_name is None:
|
||||||
|
raise ValueError("Bucket name and output file name must be set before writing to S3.")
|
||||||
# Store file as csv - will store in the same route location as the input file
|
# Store file as csv - will store in the same route location as the input file
|
||||||
save_csv_to_s3(dataframe=self.data, bucket_name=bucket_name, file_name=file_name)
|
save_csv_to_s3(dataframe=self.data, bucket_name=self.bucket_name, file_name=self.output_file_name)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def assert_nulls_only_from_source_nulls(data: pd.DataFrame, original_column: str, mapped_column: str) -> bool:
|
def assert_nulls_only_from_source_nulls(data: pd.DataFrame, original_column: str, mapped_column: str) -> bool:
|
||||||
|
|
|
||||||
|
|
@ -36,12 +36,13 @@ def handler(event, context):
|
||||||
onboarder = Onboarder(
|
onboarder = Onboarder(
|
||||||
fileuri=validated_event.s3_uri,
|
fileuri=validated_event.s3_uri,
|
||||||
format=validated_event.format,
|
format=validated_event.format,
|
||||||
sheet_name=validated_event.sheet_name
|
sheet_name=validated_event.sheet_name,
|
||||||
|
file_format=validated_event.format
|
||||||
)
|
)
|
||||||
|
|
||||||
logger.info("Transforming data")
|
logger.info("Transforming data")
|
||||||
onboarder.transform()
|
onboarder.transform()
|
||||||
logger.info("Writing data")
|
logger.info(f"Writing data to {onboarder.output_file_name}, bucket: {onboarder.bucket_name}")
|
||||||
onboarder.write()
|
onboarder.write()
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Failed to process record: {e}")
|
logger.error(f"Failed to process record: {e}")
|
||||||
|
|
|
||||||
|
|
@ -30,13 +30,16 @@ class ParityOnboarder(OnboarderBase):
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
fileuri: str,
|
fileuri: str,
|
||||||
|
file_format: str,
|
||||||
**kwargs
|
**kwargs
|
||||||
):
|
):
|
||||||
# Extract bucket, and filekey; Will be in the format s3://bucket/key
|
# Extract bucket, and filekey; Will be in the format s3://bucket/key
|
||||||
bucket_name = fileuri.split("/")[2]
|
self.bucket_name = fileuri.split("/")[2]
|
||||||
file_name = "/".join(fileuri.split("/")[3:])
|
self.input_file_name = "/".join(fileuri.split("/")[3:])
|
||||||
|
# Also prepare output file name
|
||||||
|
self.output_file_name = self.input_file_name.replace("." + file_format, "") + "_transformed.csv"
|
||||||
|
|
||||||
self.read_s3(bucket_name=bucket_name, file_name=file_name, **kwargs)
|
self.read_s3(file_format=file_format, **kwargs)
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def map_construction_age_band(self):
|
def map_construction_age_band(self):
|
||||||
|
|
@ -242,7 +245,7 @@ class ParityOnboarder(OnboarderBase):
|
||||||
# however
|
# however
|
||||||
self.data[
|
self.data[
|
||||||
[
|
[
|
||||||
self.landlord_windows_construction,
|
self.landlord_windows_type,
|
||||||
self.landlord_windows_efficiency,
|
self.landlord_windows_efficiency,
|
||||||
self.landlord_multi_glaze_proportion,
|
self.landlord_multi_glaze_proportion,
|
||||||
self.landlord_glazed_type,
|
self.landlord_glazed_type,
|
||||||
|
|
@ -261,10 +264,10 @@ class ParityOnboarder(OnboarderBase):
|
||||||
[
|
[
|
||||||
self.landlord_heating_construction,
|
self.landlord_heating_construction,
|
||||||
self.landlord_heating_efficiency,
|
self.landlord_heating_efficiency,
|
||||||
self.landlord_fuel_construction,
|
self.landlord_fuel_type,
|
||||||
self.landlord_heating_controls_construction,
|
self.landlord_heating_controls,
|
||||||
self.landlord_heating_controls_efficiency,
|
self.landlord_heating_controls_efficiency,
|
||||||
self.landlord_hot_water_system_construction,
|
self.landlord_hot_water_system,
|
||||||
self.landlord_hot_water_efficiency
|
self.landlord_hot_water_efficiency
|
||||||
]
|
]
|
||||||
] = self.data[
|
] = self.data[
|
||||||
|
|
@ -301,17 +304,17 @@ class ParityOnboarder(OnboarderBase):
|
||||||
self.landlord_roof_efficiency,
|
self.landlord_roof_efficiency,
|
||||||
self.landlord_has_sloping_ceiling,
|
self.landlord_has_sloping_ceiling,
|
||||||
self.landlord_floor_construction,
|
self.landlord_floor_construction,
|
||||||
self.landlord_windows_construction,
|
self.landlord_windows_type,
|
||||||
self.landlord_windows_efficiency,
|
self.landlord_windows_efficiency,
|
||||||
self.landlord_multi_glaze_proportion,
|
self.landlord_multi_glaze_proportion,
|
||||||
self.landlord_glazed_type,
|
self.landlord_glazed_type,
|
||||||
self.landlord_glazed_area,
|
self.landlord_glazed_area,
|
||||||
self.landlord_heating_construction,
|
self.landlord_heating_construction,
|
||||||
self.landlord_heating_efficiency,
|
self.landlord_heating_efficiency,
|
||||||
self.landlord_fuel_construction,
|
self.landlord_fuel_type,
|
||||||
self.landlord_heating_controls_construction,
|
self.landlord_heating_controls,
|
||||||
self.landlord_heating_controls_efficiency,
|
self.landlord_heating_controls_efficiency,
|
||||||
self.landlord_hot_water_system_construction,
|
self.landlord_hot_water_system,
|
||||||
self.landlord_hot_water_efficiency
|
self.landlord_hot_water_efficiency
|
||||||
]
|
]
|
||||||
].rename(
|
].rename(
|
||||||
|
|
@ -324,6 +327,17 @@ class ParityOnboarder(OnboarderBase):
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def extract_values(self):
|
||||||
|
for columns in [
|
||||||
|
self.landlord_construction_age_band, self.landlord_property_type, self.landlord_built_form,
|
||||||
|
self.landlord_wall_construction, self.landlord_wall_efficiency, self.landlord_roof_construction,
|
||||||
|
self.landlord_roof_efficiency, self.landlord_floor_construction, self.landlord_windows_type,
|
||||||
|
self.landlord_windows_efficiency, self.landlord_heating_construction, self.landlord_heating_efficiency,
|
||||||
|
self.landlord_fuel_type, self.landlord_heating_controls, self.landlord_heating_controls_efficiency,
|
||||||
|
self.landlord_hot_water_system, self.landlord_hot_water_efficiency
|
||||||
|
]:
|
||||||
|
self.data[columns] = self.data[columns].progress_apply(lambda x: x.value if hasattr(x, "value") else x)
|
||||||
|
|
||||||
def transform(self):
|
def transform(self):
|
||||||
# ------------ construction_age_band ------------
|
# ------------ construction_age_band ------------
|
||||||
self.map_construction_age_band()
|
self.map_construction_age_band()
|
||||||
|
|
@ -354,3 +368,4 @@ class ParityOnboarder(OnboarderBase):
|
||||||
|
|
||||||
# ------------ Formatting ------------
|
# ------------ Formatting ------------
|
||||||
self.select_columns()
|
self.select_columns()
|
||||||
|
self.extract_values()
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,6 @@
|
||||||
|
boto3
|
||||||
|
numpy==2.1.2
|
||||||
|
pandas==2.2.3
|
||||||
|
tqdm==4.66.5
|
||||||
|
pydantic==2.9.2
|
||||||
|
openpyxl==3.1.2
|
||||||
Loading…
Add table
Reference in a new issue