refactoring construction of the attributes

This commit is contained in:
Khalim Conn-Kowlessar 2025-02-20 09:20:25 +00:00
parent ed333e1714
commit 8bf6aa5af2

View file

@ -21,6 +21,8 @@ from recommendations.recommendation_utils import (
estimate_number_of_floors
)
from etl.epc_clean.epc_attributes.RoofAttributes import RoofAttributes
logger = setup_logger()
# OpenAI API Key (set this in your environment variables for security)
@ -279,9 +281,19 @@ class AssetList:
"Any further surveyor notes", 'Surveyors Name'
]
# This SAP threshold is a key search criterion for properties that may be eligible for extraction
SAP_RATING_THRESHOLD = 75
# Any EPC lodged before this year is deemed to be unreliable
EPC_YEAR_THRESHOLD = pd.Timestamp.now().year - 5
# Attributes - these are columns that we produce, calculated based on other pieces of data
ATTRIBUTE_HAS_SOLAR = "attribute_has_solar"
ATTRIBUTE_NUMBER_OF_FLOORS = "attribute_est_number_floors"
ATTRIBUTE_ESTIMATED_PERIMETER = "attribute_est_perimter"
ATTRIBUTE_HEAT_LOSS_AREA = "attribute_heat_loss_area"
ATTRIBUTE_EPC_ROOF_INSULATION_THICKNESS = "attribute_epc_roof_insulation_thickness"
ATTRIBUTE_SAP_THRESHOLD_AND_BELOW = f"sap_rating_{SAP_RATING_THRESHOLD}_and_below"
ATTRIBUTE_EPC_PRE_YEAR_THRESHOLD = f"EPC is pre {EPC_YEAR_THRESHOLD}"
def __init__(
self,
@ -672,3 +684,56 @@ class AssetList:
),
axis=1
)
self.standardised_asset_list[self.EPC_API_DATA_NAMES["total-floor-area"]] = (
self.standardised_asset_list[self.EPC_API_DATA_NAMES["total-floor-area"]].astype(float)
)
# Replace empty-string values with None so the column can be cast to float below
self.standardised_asset_list[self.EPC_API_DATA_NAMES["number-habitable-rooms"]] = (
self.standardised_asset_list[self.EPC_API_DATA_NAMES["number-habitable-rooms"]].replace("", None)
)
self.standardised_asset_list[self.EPC_API_DATA_NAMES["number-habitable-rooms"]] = (
self.standardised_asset_list[self.EPC_API_DATA_NAMES["number-habitable-rooms"]].astype(float)
)
# Estimate the perimeter
self.standardised_asset_list[self.ATTRIBUTE_ESTIMATED_PERIMETER] = self.standardised_asset_list.apply(
lambda x: estimate_perimeter(
floor_area=x[self.EPC_API_DATA_NAMES["total-floor-area"]] / x[self.ATTRIBUTE_NUMBER_OF_FLOORS],
num_rooms=x[self.EPC_API_DATA_NAMES["number-habitable-rooms"]] / x[self.ATTRIBUTE_NUMBER_OF_FLOORS],
), axis=1
)
self.standardised_asset_list[self.ATTRIBUTE_HEAT_LOSS_AREA] = self.standardised_asset_list.apply(
lambda x: estimate_external_wall_area(
num_floors=x[self.ATTRIBUTE_NUMBER_OF_FLOORS],
floor_height=(
float(x[self.EPC_API_DATA_NAMES["floor-height"]]) if
x[self.EPC_API_DATA_NAMES["floor-height"]] else 2.5
),
perimeter=x[self.ATTRIBUTE_ESTIMATED_PERIMETER],
built_form=x[self.EPC_API_DATA_NAMES["built-form"]]
),
axis=1
)
self.standardised_asset_list[self.ATTRIBUTE_EPC_ROOF_INSULATION_THICKNESS] = self.standardised_asset_list.apply(
lambda x: RoofAttributes(description=x[self.EPC_API_DATA_NAMES["roof-description"]]).process()[
"insulation_thickness"] if not pd.isnull(
x[self.EPC_API_DATA_NAMES["roof-description"]]) else None,
axis=1
)
# We produce some additional fields
# 1) Is the SAP rating at or below the threshold (SAP_RATING_THRESHOLD)
self.standardised_asset_list[self.ATTRIBUTE_SAP_THRESHOLD_AND_BELOW] = (
self.standardised_asset_list[self.EPC_API_DATA_NAMES["current-energy-efficiency"]] <=
self.SAP_RATING_THRESHOLD
)
# 2) Flag anything where the EPC lodgement year is earlier than EPC_YEAR_THRESHOLD (current year minus 5)
self.standardised_asset_list[self.ATTRIBUTE_EPC_PRE_YEAR_THRESHOLD] = (
pd.to_datetime(
self.standardised_asset_list[self.EPC_API_DATA_NAMES["lodgement-date"]]
).dt.year < self.EPC_YEAR_THRESHOLD
)