mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
add post sap 10 feature
This commit is contained in:
parent
92fcbe8cdb
commit
6aefd1eb3c
4 changed files with 40 additions and 16 deletions
|
|
@ -4,6 +4,7 @@ import pandas as pd
|
|||
from etl.epc.settings import (
|
||||
DATA_PROCESSOR_SETTINGS,
|
||||
EARLIEST_EPC_DATE,
|
||||
POST_SAP10_DATE,
|
||||
# IGNORED_TRANSACTION_TYPES,
|
||||
IGNORED_FLOOR_LEVELS,
|
||||
IGNORED_PROPERTY_TYPES,
|
||||
|
|
@ -159,6 +160,9 @@ class EPCDataProcessor:
|
|||
# colnames=["NUMBER_HABITABLE_ROOMS", "NUMBER_HEATED_ROOMS"],
|
||||
# )
|
||||
|
||||
# Create post sap10 flag
|
||||
self.create_post_sap10_flag()
|
||||
|
||||
# When running in newdata mode, cleaning_averages is lower-cased, so we coerce it back to upper case
|
||||
cleaning_averages = self.cleaning_averages.copy()
|
||||
if self.run_mode == "newdata":
|
||||
|
|
@ -175,6 +179,13 @@ class EPCDataProcessor:
|
|||
self.cast_cleaning_averages_columns_to_lower(ignore_step=ignore_step)
|
||||
self.cast_data_columns_to_lower()
|
||||
|
||||
def create_post_sap10_flag(self):
|
||||
"""
|
||||
Add an "is_post_sap10" column to self.data: True when LODGEMENT_DATE is on or after POST_SAP10_DATE
|
||||
"""
|
||||
|
||||
self.data["is_post_sap10"] = self.data["LODGEMENT_DATE"] >= POST_SAP10_DATE
|
||||
|
||||
def cast_data_columns_to_lower(self):
|
||||
"""
|
||||
Convert all columns names to lower
|
||||
|
|
|
|||
|
|
@ -23,6 +23,7 @@ from etl.epc.settings import (
|
|||
POTENTIAL_COLUMNS,
|
||||
ROOM_FEATURES,
|
||||
COST_FEATURES,
|
||||
POST_SAP10_FEATURE,
|
||||
)
|
||||
|
||||
# TODO: change in setting file
|
||||
|
|
@ -325,7 +326,9 @@ class EPCPipeline:
|
|||
|
||||
# We include the lodgement date here as we probably need to factor time into the
|
||||
# model, since EPC standards and rigour have changed over time
|
||||
variable_data = property_data[VARIABLE_DATA_FEATURES + COST_FEATURES]
|
||||
variable_data = property_data[
|
||||
VARIABLE_DATA_FEATURES + COST_FEATURES + POST_SAP10_FEATURE
|
||||
]
|
||||
|
||||
uprn = str(uprn)
|
||||
epc_records = [
|
||||
|
|
|
|||
|
|
@ -20,6 +20,7 @@ from etl.epc.settings import (
|
|||
COMPONENT_FEATURES,
|
||||
EFFICIENCY_FEATURES,
|
||||
ROOM_FEATURES,
|
||||
POST_SAP10_FEATURE,
|
||||
)
|
||||
from recommendations.recommendation_utils import estimate_number_of_floors
|
||||
from utils.s3 import read_dataframe_from_s3_parquet
|
||||
|
|
@ -89,6 +90,7 @@ class EPCRecord:
|
|||
co2_emissions_current: float = None
|
||||
number_habitable_rooms: float = None
|
||||
number_heated_rooms: float = None
|
||||
is_post_sap10: bool = None
|
||||
|
||||
# u_values_walls = None
|
||||
# u_values_roof = None
|
||||
|
|
@ -277,6 +279,7 @@ class EPCRecord:
|
|||
self.number_heated_rooms: float = float(
|
||||
self.prepared_epc["number_heated_rooms"]
|
||||
)
|
||||
self.is_post_sap10: bool = bool(self.prepared_epc["is_post_sap10"])
|
||||
|
||||
def _identify_delta_between_prepared_and_original_records(self):
|
||||
"""
|
||||
|
|
@ -385,11 +388,11 @@ class EPCRecord:
|
|||
return df
|
||||
|
||||
def _clean_floor_height(self):
|
||||
""" Remaps anomalies in floor height to the average floor height for the property type """
|
||||
"""Remaps anomalies in floor height to the average floor height for the property type"""
|
||||
floor_height_data = self.cleaning_data[
|
||||
(self.cleaning_data["property_type"] == self.prepared_epc["property-type"]) &
|
||||
(self.cleaning_data["built_form"] == self.prepared_epc["built-form"])
|
||||
]
|
||||
(self.cleaning_data["property_type"] == self.prepared_epc["property-type"])
|
||||
& (self.cleaning_data["built_form"] == self.prepared_epc["built-form"])
|
||||
]
|
||||
average = floor_height_data["floor_height"].mean()
|
||||
sd = floor_height_data["floor_height"].std()
|
||||
# If we're in the top 0.5 percentile of floor heights, we'll set it to the average
|
||||
|
|
@ -399,14 +402,16 @@ class EPCRecord:
|
|||
self.prepared_epc["floor-height"] = average
|
||||
|
||||
def _clean_new_build_descriptions(self):
|
||||
for col in ['roof-description', 'walls-description', 'floor-description']:
|
||||
for col in ["roof-description", "walls-description", "floor-description"]:
|
||||
self.prepared_epc[col] = self.prepared_epc[col].replace("W/m²K", "W/m-¦K")
|
||||
|
||||
def _clean_constituency(self):
|
||||
"""
|
||||
We handle the single case of finding a missing constituency by using the local authority
|
||||
"""
|
||||
if pd.isnull(self.prepared_epc["constituency"]) or (self.prepared_epc["constituency"] == ""):
|
||||
if pd.isnull(self.prepared_epc["constituency"]) or (
|
||||
self.prepared_epc["constituency"] == ""
|
||||
):
|
||||
if self.prepared_epc["local-authority"] != "E06000044":
|
||||
raise NotImplementedError(
|
||||
"This function is only implemented for Portsmouth, in the single edgecase seen"
|
||||
|
|
@ -595,12 +600,12 @@ class EPCRecord:
|
|||
|
||||
# We handle the edge case of floor area being 0. We set it to None and it is cleaned by
|
||||
# _clean_with_data_processor
|
||||
if self.prepared_epc['total-floor-area'] == 0:
|
||||
if self.prepared_epc["total-floor-area"] == 0:
|
||||
print(
|
||||
"Edge case of floor area being zero - will set to none and will be cleaned in "
|
||||
"_clean_with_data_processor"
|
||||
)
|
||||
self.prepared_epc['total-floor-area'] = None
|
||||
self.prepared_epc["total-floor-area"] = None
|
||||
|
||||
def _clean_mains_gas(self):
|
||||
"""
|
||||
|
|
@ -609,12 +614,7 @@ class EPCRecord:
|
|||
if not self.prepared_epc:
|
||||
raise ValueError("EPC Recrod doesn not contain epc data")
|
||||
|
||||
mains_gas_map = {
|
||||
"Y": True,
|
||||
"N": False,
|
||||
True: True,
|
||||
False: False
|
||||
}
|
||||
mains_gas_map = {"Y": True, "N": False, True: True, False: False}
|
||||
|
||||
self.prepared_epc["mains-gas-flag"] = (
|
||||
None
|
||||
|
|
@ -1064,7 +1064,12 @@ class EPCDifferenceRecord:
|
|||
CARBON_RESPONSE
|
||||
)
|
||||
|
||||
component_variables = COMPONENT_FEATURES + EFFICIENCY_FEATURES + ROOM_FEATURES
|
||||
component_variables = (
|
||||
COMPONENT_FEATURES
|
||||
+ EFFICIENCY_FEATURES
|
||||
+ ROOM_FEATURES
|
||||
+ POST_SAP10_FEATURE
|
||||
)
|
||||
ending_record = self.record2.get(
|
||||
component_variables + ["lodgement_date"],
|
||||
return_asdict=True,
|
||||
|
|
|
|||
|
|
@ -52,6 +52,9 @@ DATA_ANOMALY_MATCHES = {
|
|||
"Unknown",
|
||||
}
|
||||
|
||||
# Cut-off date for SAP10: EPCs lodged on or after this date are flagged as post-SAP10 (is_post_sap10)
|
||||
POST_SAP10_DATE = "2025-06-22"
|
||||
|
||||
DATA_ANOMALY_SUBSTRINGS = {
|
||||
# Where values in a ‘pick’ list that have been superseded by another value. For example, where a value for
|
||||
# ‘pitched roof’ has been replaced by three sub-categories of pitched roof. The original value is retained
|
||||
|
|
@ -184,6 +187,8 @@ EFFICIENCY_FEATURES = [
|
|||
|
||||
ROOM_FEATURES = ["number_habitable_rooms", "number_heated_rooms"]
|
||||
|
||||
POST_SAP10_FEATURE = ["is_post_sap10"]
|
||||
|
||||
COMPONENT_FEATURES = CORE_COMPONENT_FEATURES + [
|
||||
"TRANSACTION_TYPE",
|
||||
"ENERGY_TARIFF", # Not sure if this is relevant
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue