mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-30 13:10:47 +00:00
add post sap 10 feature
This commit is contained in:
parent
92fcbe8cdb
commit
6aefd1eb3c
4 changed files with 40 additions and 16 deletions
|
|
@ -4,6 +4,7 @@ import pandas as pd
|
||||||
from etl.epc.settings import (
|
from etl.epc.settings import (
|
||||||
DATA_PROCESSOR_SETTINGS,
|
DATA_PROCESSOR_SETTINGS,
|
||||||
EARLIEST_EPC_DATE,
|
EARLIEST_EPC_DATE,
|
||||||
|
POST_SAP10_DATE,
|
||||||
# IGNORED_TRANSACTION_TYPES,
|
# IGNORED_TRANSACTION_TYPES,
|
||||||
IGNORED_FLOOR_LEVELS,
|
IGNORED_FLOOR_LEVELS,
|
||||||
IGNORED_PROPERTY_TYPES,
|
IGNORED_PROPERTY_TYPES,
|
||||||
|
|
@ -159,6 +160,9 @@ class EPCDataProcessor:
|
||||||
# colnames=["NUMBER_HABITABLE_ROOMS", "NUMBER_HEATED_ROOMS"],
|
# colnames=["NUMBER_HABITABLE_ROOMS", "NUMBER_HEATED_ROOMS"],
|
||||||
# )
|
# )
|
||||||
|
|
||||||
|
# Create post sap10 flag
|
||||||
|
self.create_post_sap10_flag()
|
||||||
|
|
||||||
# When running in newdata mode, cleaning_averages has lower cases so we coerce back to upper
|
# When running in newdata mode, cleaning_averages has lower cases so we coerce back to upper
|
||||||
cleaning_averages = self.cleaning_averages.copy()
|
cleaning_averages = self.cleaning_averages.copy()
|
||||||
if self.run_mode == "newdata":
|
if self.run_mode == "newdata":
|
||||||
|
|
@ -175,6 +179,13 @@ class EPCDataProcessor:
|
||||||
self.cast_cleaning_averages_columns_to_lower(ignore_step=ignore_step)
|
self.cast_cleaning_averages_columns_to_lower(ignore_step=ignore_step)
|
||||||
self.cast_data_columns_to_lower()
|
self.cast_data_columns_to_lower()
|
||||||
|
|
||||||
|
def create_post_sap10_flag(self):
|
||||||
|
"""
|
||||||
|
Create a boolean "is_post_sap10" column that is True when the EPC's LODGEMENT_DATE is on or after POST_SAP10_DATE
|
||||||
|
"""
|
||||||
|
|
||||||
|
self.data["is_post_sap10"] = self.data["LODGEMENT_DATE"] >= POST_SAP10_DATE
|
||||||
|
|
||||||
def cast_data_columns_to_lower(self):
|
def cast_data_columns_to_lower(self):
|
||||||
"""
|
"""
|
||||||
Convert all columns names to lower
|
Convert all columns names to lower
|
||||||
|
|
|
||||||
|
|
@ -23,6 +23,7 @@ from etl.epc.settings import (
|
||||||
POTENTIAL_COLUMNS,
|
POTENTIAL_COLUMNS,
|
||||||
ROOM_FEATURES,
|
ROOM_FEATURES,
|
||||||
COST_FEATURES,
|
COST_FEATURES,
|
||||||
|
POST_SAP10_FEATURE,
|
||||||
)
|
)
|
||||||
|
|
||||||
# TODO: change in setting file
|
# TODO: change in setting file
|
||||||
|
|
@ -325,7 +326,9 @@ class EPCPipeline:
|
||||||
|
|
||||||
# We include the lodgement date here as we probably need to factor time into the
|
# We include the lodgement date here as we probably need to factor time into the
|
||||||
# model, since EPC standards and rigour have changed over time
|
# model, since EPC standards and rigour have changed over time
|
||||||
variable_data = property_data[VARIABLE_DATA_FEATURES + COST_FEATURES]
|
variable_data = property_data[
|
||||||
|
VARIABLE_DATA_FEATURES + COST_FEATURES + POST_SAP10_FEATURE
|
||||||
|
]
|
||||||
|
|
||||||
uprn = str(uprn)
|
uprn = str(uprn)
|
||||||
epc_records = [
|
epc_records = [
|
||||||
|
|
|
||||||
|
|
@ -20,6 +20,7 @@ from etl.epc.settings import (
|
||||||
COMPONENT_FEATURES,
|
COMPONENT_FEATURES,
|
||||||
EFFICIENCY_FEATURES,
|
EFFICIENCY_FEATURES,
|
||||||
ROOM_FEATURES,
|
ROOM_FEATURES,
|
||||||
|
POST_SAP10_FEATURE,
|
||||||
)
|
)
|
||||||
from recommendations.recommendation_utils import estimate_number_of_floors
|
from recommendations.recommendation_utils import estimate_number_of_floors
|
||||||
from utils.s3 import read_dataframe_from_s3_parquet
|
from utils.s3 import read_dataframe_from_s3_parquet
|
||||||
|
|
@ -89,6 +90,7 @@ class EPCRecord:
|
||||||
co2_emissions_current: float = None
|
co2_emissions_current: float = None
|
||||||
number_habitable_rooms: float = None
|
number_habitable_rooms: float = None
|
||||||
number_heated_rooms: float = None
|
number_heated_rooms: float = None
|
||||||
|
is_post_sap10: bool = None
|
||||||
|
|
||||||
# u_values_walls = None
|
# u_values_walls = None
|
||||||
# u_values_roof = None
|
# u_values_roof = None
|
||||||
|
|
@ -277,6 +279,7 @@ class EPCRecord:
|
||||||
self.number_heated_rooms: float = float(
|
self.number_heated_rooms: float = float(
|
||||||
self.prepared_epc["number_heated_rooms"]
|
self.prepared_epc["number_heated_rooms"]
|
||||||
)
|
)
|
||||||
|
self.is_post_sap10: bool = bool(self.prepared_epc["is_post_sap10"])
|
||||||
|
|
||||||
def _identify_delta_between_prepared_and_original_records(self):
|
def _identify_delta_between_prepared_and_original_records(self):
|
||||||
"""
|
"""
|
||||||
|
|
@ -385,11 +388,11 @@ class EPCRecord:
|
||||||
return df
|
return df
|
||||||
|
|
||||||
def _clean_floor_height(self):
|
def _clean_floor_height(self):
|
||||||
""" Remaps anomalies in floor height to the average floor height for the property type """
|
"""Remaps anomalies in floor height to the average floor height for the property type"""
|
||||||
floor_height_data = self.cleaning_data[
|
floor_height_data = self.cleaning_data[
|
||||||
(self.cleaning_data["property_type"] == self.prepared_epc["property-type"]) &
|
(self.cleaning_data["property_type"] == self.prepared_epc["property-type"])
|
||||||
(self.cleaning_data["built_form"] == self.prepared_epc["built-form"])
|
& (self.cleaning_data["built_form"] == self.prepared_epc["built-form"])
|
||||||
]
|
]
|
||||||
average = floor_height_data["floor_height"].mean()
|
average = floor_height_data["floor_height"].mean()
|
||||||
sd = floor_height_data["floor_height"].std()
|
sd = floor_height_data["floor_height"].std()
|
||||||
# If we're in the top 0.5 percentile of floor heights, we'll set it to the average
|
# If we're in the top 0.5 percentile of floor heights, we'll set it to the average
|
||||||
|
|
@ -399,14 +402,16 @@ class EPCRecord:
|
||||||
self.prepared_epc["floor-height"] = average
|
self.prepared_epc["floor-height"] = average
|
||||||
|
|
||||||
def _clean_new_build_descriptions(self):
|
def _clean_new_build_descriptions(self):
|
||||||
for col in ['roof-description', 'walls-description', 'floor-description']:
|
for col in ["roof-description", "walls-description", "floor-description"]:
|
||||||
self.prepared_epc[col] = self.prepared_epc[col].replace("W/m²K", "W/m-¦K")
|
self.prepared_epc[col] = self.prepared_epc[col].replace("W/m²K", "W/m-¦K")
|
||||||
|
|
||||||
def _clean_constituency(self):
|
def _clean_constituency(self):
|
||||||
"""
|
"""
|
||||||
We handle the single case of finding a missing constituency by using the local authority
|
We handle the single case of finding a missing constituency by using the local authority
|
||||||
"""
|
"""
|
||||||
if pd.isnull(self.prepared_epc["constituency"]) or (self.prepared_epc["constituency"] == ""):
|
if pd.isnull(self.prepared_epc["constituency"]) or (
|
||||||
|
self.prepared_epc["constituency"] == ""
|
||||||
|
):
|
||||||
if self.prepared_epc["local-authority"] != "E06000044":
|
if self.prepared_epc["local-authority"] != "E06000044":
|
||||||
raise NotImplementedError(
|
raise NotImplementedError(
|
||||||
"This function is only implemented for Portsmouth, in the single edgecase seen"
|
"This function is only implemented for Portsmouth, in the single edgecase seen"
|
||||||
|
|
@ -595,12 +600,12 @@ class EPCRecord:
|
||||||
|
|
||||||
# We handle the edge case of floor area being 0. We set it to None and it is cleaned by
|
# We handle the edge case of floor area being 0. We set it to None and it is cleaned by
|
||||||
# _clean_with_data_processor
|
# _clean_with_data_processor
|
||||||
if self.prepared_epc['total-floor-area'] == 0:
|
if self.prepared_epc["total-floor-area"] == 0:
|
||||||
print(
|
print(
|
||||||
"Edge case of floor area being zero - will set to none and will be cleaned in "
|
"Edge case of floor area being zero - will set to none and will be cleaned in "
|
||||||
"_clean_with_data_processor"
|
"_clean_with_data_processor"
|
||||||
)
|
)
|
||||||
self.prepared_epc['total-floor-area'] = None
|
self.prepared_epc["total-floor-area"] = None
|
||||||
|
|
||||||
def _clean_mains_gas(self):
|
def _clean_mains_gas(self):
|
||||||
"""
|
"""
|
||||||
|
|
@ -609,12 +614,7 @@ class EPCRecord:
|
||||||
if not self.prepared_epc:
|
if not self.prepared_epc:
|
||||||
raise ValueError("EPC Recrod doesn not contain epc data")
|
raise ValueError("EPC Recrod doesn not contain epc data")
|
||||||
|
|
||||||
mains_gas_map = {
|
mains_gas_map = {"Y": True, "N": False, True: True, False: False}
|
||||||
"Y": True,
|
|
||||||
"N": False,
|
|
||||||
True: True,
|
|
||||||
False: False
|
|
||||||
}
|
|
||||||
|
|
||||||
self.prepared_epc["mains-gas-flag"] = (
|
self.prepared_epc["mains-gas-flag"] = (
|
||||||
None
|
None
|
||||||
|
|
@ -1064,7 +1064,12 @@ class EPCDifferenceRecord:
|
||||||
CARBON_RESPONSE
|
CARBON_RESPONSE
|
||||||
)
|
)
|
||||||
|
|
||||||
component_variables = COMPONENT_FEATURES + EFFICIENCY_FEATURES + ROOM_FEATURES
|
component_variables = (
|
||||||
|
COMPONENT_FEATURES
|
||||||
|
+ EFFICIENCY_FEATURES
|
||||||
|
+ ROOM_FEATURES
|
||||||
|
+ POST_SAP10_FEATURE
|
||||||
|
)
|
||||||
ending_record = self.record2.get(
|
ending_record = self.record2.get(
|
||||||
component_variables + ["lodgement_date"],
|
component_variables + ["lodgement_date"],
|
||||||
return_asdict=True,
|
return_asdict=True,
|
||||||
|
|
|
||||||
|
|
@ -52,6 +52,9 @@ DATA_ANOMALY_MATCHES = {
|
||||||
"Unknown",
|
"Unknown",
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Cut-off date for the is_post_sap10 flag: EPCs lodged on or after this date are treated as post SAP10
|
||||||
|
POST_SAP10_DATE = "2025-06-22"
|
||||||
|
|
||||||
DATA_ANOMALY_SUBSTRINGS = {
|
DATA_ANOMALY_SUBSTRINGS = {
|
||||||
# Where values in a ‘pick’ list that have been superseded by another value. For example, where a value for
|
# Where values in a ‘pick’ list that have been superseded by another value. For example, where a value for
|
||||||
# ‘pitched roof’ has been replaced by three sub-categories of pitched roof. The original value is retained
|
# ‘pitched roof’ has been replaced by three sub-categories of pitched roof. The original value is retained
|
||||||
|
|
@ -184,6 +187,8 @@ EFFICIENCY_FEATURES = [
|
||||||
|
|
||||||
ROOM_FEATURES = ["number_habitable_rooms", "number_heated_rooms"]
|
ROOM_FEATURES = ["number_habitable_rooms", "number_heated_rooms"]
|
||||||
|
|
||||||
|
POST_SAP10_FEATURE = ["is_post_sap10"]
|
||||||
|
|
||||||
COMPONENT_FEATURES = CORE_COMPONENT_FEATURES + [
|
COMPONENT_FEATURES = CORE_COMPONENT_FEATURES + [
|
||||||
"TRANSACTION_TYPE",
|
"TRANSACTION_TYPE",
|
||||||
"ENERGY_TARIFF", # Not sure if this is relevant
|
"ENERGY_TARIFF", # Not sure if this is relevant
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue