Fix pipeline for July 2025 data; keep the lodgement date so the new RdSAP standard can be analysed in the ML stage

This commit is contained in:
Michael Duong 2025-08-26 10:44:31 +01:00
parent 6c6a44abfe
commit 723beaf104
5 changed files with 775 additions and 161 deletions

4
.gitignore vendored
View file

@ -275,4 +275,6 @@ cache/
*/.idea
*.png
*.pptx
*.pptx
local_data*

View file

@ -30,24 +30,25 @@ class Definitions:
# The Building Emission Rate (BER) data field for non-domestic buildings may contain a blank value. The BER
# was only lodged on the register from 7 March 2010.
"Blank"
# There are currently just over 8,600 records where the local authority identifier is null. This is due to
# the Register Operator not being able to match the building address in the Markermap Ordinance Survey (GB)
# lookup tables or OS MasterMap Address Layer 2 data. The majority of these addresses have been requested
# manually by energy assessors for inclusion by the Register Operator in the registers (e.g. new builds,
# etc). These records are being published for completeness. An ongoing process to manage these manually added
# There are currently just over 8,600 records where the local authority identifier is null. This is due to
# the Register Operator not being able to match the building address in the Markermap Ordnance Survey (GB)
# lookup tables or OS MasterMap Address Layer 2 data. The majority of these addresses have been requested
# manually by energy assessors for inclusion by the Register Operator in the registers (e.g. new builds,
# etc). These records are being published for completeness. An ongoing process to manage these manually added
# addresses will take time to develop to deal with these and future anomalies.
#
# There are several fields within the lodged data where it is possible to enter multiple entries to cater for
# different data_types of build within a single property, i.e. extensions. This results in multiple entries for
# the description fields for floor, roof and wall. For the purposes of this data release only the information
# contained within the first of these multiple entries is being provided. As there are no restrictions on the
# value in this first field it means that sometimes the first field in a multiple entry description field may
# There are several fields within the lodged data where it is possible to enter multiple entries to cater for
# different types of build within a single property, i.e. extensions. This results in multiple entries for
# the description fields for floor, roof and wall. For the purposes of this data release only the information
# contained within the first of these multiple entries is being provided. As there are no restrictions on the
# value in this first field it means that sometimes the first field in a multiple entry description field may
# contain a null value. A resolution to correct these anomalies will be considered for future data releases.
"NULL",
# We sometimes see fields populated with just an empty string.
"",
# An older value which rarely shows up but has been seen in the data.
"UNKNOWN",
"Unknown",
}
DATA_ANOMALY_SUBSTRINGS = {

View file

@ -48,6 +48,8 @@ construction_age_bounds_map = {
"England and Wales: 2003-2006": {"l": 2003, "u": 2006},
"England and Wales: 2007-2011": {"l": 2007, "u": 2011},
"England and Wales: 2012 onwards": {"l": 2012, "u": 3000},
"England and Wales: 2012-2021": {"l": 2012, "u": 2021},
"England and Wales: 2022 onwards": {"l": 2022, "u": 3000},
}
construction_age_remap = {
@ -384,7 +386,7 @@ class EPCDataProcessor:
has_missings = pd.isnull(self.data[col]).sum()
while has_missings:
self.data = apply_clean(
data=self.data, matching_columns=matching_columns[0: to_index + 1]
data=self.data, matching_columns=matching_columns[0 : to_index + 1]
)
has_missings = pd.isnull(self.data[col]).sum()
@ -858,7 +860,9 @@ class EPCDataProcessor:
# Fill NaN values with averages
for col in cols_to_clean:
data_to_clean[col] = data_to_clean[col].fillna(data_to_clean[f"{col}_AVERAGE"])
data_to_clean[col] = data_to_clean[col].fillna(
data_to_clean[f"{col}_AVERAGE"]
)
data_to_clean = data_to_clean.drop(columns=[f"{col}_AVERAGE"])
# If we still have missings
data_to_clean[col] = data_to_clean[col].fillna(data_to_clean[col].mean())

View file

@ -8,7 +8,9 @@ from etl.epc_clean.epc_attributes.FloorAttributes import FloorAttributes
from etl.epc_clean.epc_attributes.RoofAttributes import RoofAttributes
from etl.epc_clean.epc_attributes.HotWaterAttributes import HotWaterAttributes
from etl.epc_clean.epc_attributes.MainheatAttributes import MainHeatAttributes
from etl.epc_clean.epc_attributes.MainheatControlAttributes import MainheatControlAttributes
from etl.epc_clean.epc_attributes.MainheatControlAttributes import (
MainheatControlAttributes,
)
from etl.epc_clean.epc_attributes.WindowAttributes import WindowAttributes
from etl.epc_clean.epc_attributes.MainFuelAttributes import MainFuelAttributes
@ -169,7 +171,7 @@ class TrainingDataset(BaseDataset):
self.df = pd.DataFrame([dataset.difference_record for dataset in datasets])
self._feature_generation()
self._drop_features()
# self._drop_features()
self._clean_efficiency_variables()
self._null_validation(information="Clean Efficiency Variables")
self._expand_description_to_features(cleaned_lookup)
@ -210,11 +212,11 @@ class TrainingDataset(BaseDataset):
common_cols = [[col + "_starting", col + "_ending"] for col in common_cols]
self.df = self.df.loc[
:,
no_suffix_cols
+ only_ending_cols
+ [col for cols in common_cols for col in cols],
]
:,
no_suffix_cols
+ only_ending_cols
+ [col for cols in common_cols for col in cols],
]
def _remove_abnormal_change_in_floor_area(self):
"""
@ -519,7 +521,7 @@ class TrainingDataset(BaseDataset):
expanded_df["is_sandstone_or_limestone"]
== expanded_df["is_sandstone_or_limestone_ending"]
)
]
]
elif component == "floor":
expanded_df = expanded_df[
(expanded_df["is_suspended"] == expanded_df["is_suspended_ending"])
@ -536,7 +538,7 @@ class TrainingDataset(BaseDataset):
expanded_df["is_to_external_air"]
== expanded_df["is_to_external_air_ending"]
)
]
]
elif component == "roof":
expanded_df = expanded_df[
(expanded_df["is_pitched"] == expanded_df["is_pitched_ending"])
@ -549,7 +551,7 @@ class TrainingDataset(BaseDataset):
expanded_df["has_dwelling_above"]
== expanded_df["has_dwelling_above_ending"]
)
]
]
return expanded_df
@ -695,10 +697,14 @@ class TrainingDataset(BaseDataset):
cleaned_lookup_df_for_key = pd.DataFrame(cleaned_lookup[cleaned_key])
# We handle a specific edge case where we're missing information for the original description
descriptions = [x for x in self.df[left_on_starting].unique() if pd.notnull(x)]
descriptions = [
x for x in self.df[left_on_starting].unique() if pd.notnull(x)
]
# take any not in the cleaned lookup
missing_descriptions = [
x for x in descriptions if x not in cleaned_lookup_df_for_key["original_description"].values
x
for x in descriptions
if x not in cleaned_lookup_df_for_key["original_description"].values
]
if missing_descriptions:
# We handle them here
@ -710,9 +716,12 @@ class TrainingDataset(BaseDataset):
cleaned_data.append(
{
"original_description": x,
"clean_description": desc_cleaner.description.replace("(assumed)",
"").rstrip().capitalize(),
**cleaned
"clean_description": desc_cleaner.description.replace(
"(assumed)", ""
)
.rstrip()
.capitalize(),
**cleaned,
}
)
cleaned_lookup_df_for_key = pd.concat(

File diff suppressed because it is too large Load diff