mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
fix pipeline for July 2025 data, keep lodgement date to do analysis on new rdsap standard in ML stage
This commit is contained in:
parent
6c6a44abfe
commit
723beaf104
5 changed files with 775 additions and 161 deletions
4
.gitignore
vendored
4
.gitignore
vendored
|
|
@ -275,4 +275,6 @@ cache/
|
|||
*/.idea
|
||||
|
||||
*.png
|
||||
*.pptx
|
||||
*.pptx
|
||||
|
||||
local_data*
|
||||
|
|
@ -30,24 +30,25 @@ class Definitions:
|
|||
# The Building Emission Rate (BER) data field for non-domestic buildings may contain a ‘blank’ value. The BER
|
||||
# was only lodged on the register from 7 March 2010.
|
||||
"Blank"
|
||||
# There are currently just over 8,600 records where the local authority identifier is ‘null’. This is due to
|
||||
# the Register Operator not being able to match the building address in the Markermap Ordnance Survey (GB)
|
||||
# lookup tables or OS MasterMap Address Layer 2 data. The majority of these addresses have been requested
|
||||
# manually by energy assessors for inclusion by the Register Operator in the registers (e.g. new builds,
|
||||
# etc). These records are being published for completeness. An ongoing process to manage these manually added
|
||||
# There are currently just over 8,600 records where the local authority identifier is ‘null’. This is due to
|
||||
# the Register Operator not being able to match the building address in the Markermap Ordnance Survey (GB)
|
||||
# lookup tables or OS MasterMap Address Layer 2 data. The majority of these addresses have been requested
|
||||
# manually by energy assessors for inclusion by the Register Operator in the registers (e.g. new builds,
|
||||
# etc). These records are being published for completeness. An ongoing process to manage these manually added
|
||||
# addresses will take time to develop to deal with these and future anomalies.
|
||||
#
|
||||
# There are several fields within the lodged data where it is possible to enter multiple entries to cater for
|
||||
# different types of build within a single property, i.e. extensions. This results in multiple entries for
|
||||
# the description fields for floor, roof and wall. For the purposes of this data release only the information
|
||||
# contained within the first of these multiple entries is being provided. As there are no restrictions on the
|
||||
# value in this first field it means that sometimes the first field in a multiple entry description field may
|
||||
# There are several fields within the lodged data where it is possible to enter multiple entries to cater for
|
||||
# different types of build within a single property, i.e. extensions. This results in multiple entries for
|
||||
# the description fields for floor, roof and wall. For the purposes of this data release only the information
|
||||
# contained within the first of these multiple entries is being provided. As there are no restrictions on the
|
||||
# value in this first field it means that sometimes the first field in a multiple entry description field may
|
||||
# contain a ‘null’ value. A resolution to correct these anomalies will be considered for future data releases.
|
||||
"NULL",
|
||||
# We sometimes see fields populated with just an empty string.
|
||||
"",
|
||||
# An older value which rarely shows up but has been seen in the data.
|
||||
"UNKNOWN",
|
||||
"Unknown",
|
||||
}
|
||||
|
||||
DATA_ANOMALY_SUBSTRINGS = {
|
||||
|
|
|
|||
|
|
@ -48,6 +48,8 @@ construction_age_bounds_map = {
|
|||
"England and Wales: 2003-2006": {"l": 2003, "u": 2006},
|
||||
"England and Wales: 2007-2011": {"l": 2007, "u": 2011},
|
||||
"England and Wales: 2012 onwards": {"l": 2012, "u": 3000},
|
||||
"England and Wales: 2012-2021": {"l": 2012, "u": 2021},
|
||||
"England and Wales: 2022 onwards": {"l": 2022, "u": 3000},
|
||||
}
|
||||
|
||||
construction_age_remap = {
|
||||
|
|
@ -384,7 +386,7 @@ class EPCDataProcessor:
|
|||
has_missings = pd.isnull(self.data[col]).sum()
|
||||
while has_missings:
|
||||
self.data = apply_clean(
|
||||
data=self.data, matching_columns=matching_columns[0: to_index + 1]
|
||||
data=self.data, matching_columns=matching_columns[0 : to_index + 1]
|
||||
)
|
||||
has_missings = pd.isnull(self.data[col]).sum()
|
||||
|
||||
|
|
@ -858,7 +860,9 @@ class EPCDataProcessor:
|
|||
|
||||
# Fill NaN values with averages
|
||||
for col in cols_to_clean:
|
||||
data_to_clean[col] = data_to_clean[col].fillna(data_to_clean[f"{col}_AVERAGE"])
|
||||
data_to_clean[col] = data_to_clean[col].fillna(
|
||||
data_to_clean[f"{col}_AVERAGE"]
|
||||
)
|
||||
data_to_clean = data_to_clean.drop(columns=[f"{col}_AVERAGE"])
|
||||
# If we still have missings
|
||||
data_to_clean[col] = data_to_clean[col].fillna(data_to_clean[col].mean())
|
||||
|
|
|
|||
|
|
@ -8,7 +8,9 @@ from etl.epc_clean.epc_attributes.FloorAttributes import FloorAttributes
|
|||
from etl.epc_clean.epc_attributes.RoofAttributes import RoofAttributes
|
||||
from etl.epc_clean.epc_attributes.HotWaterAttributes import HotWaterAttributes
|
||||
from etl.epc_clean.epc_attributes.MainheatAttributes import MainHeatAttributes
|
||||
from etl.epc_clean.epc_attributes.MainheatControlAttributes import MainheatControlAttributes
|
||||
from etl.epc_clean.epc_attributes.MainheatControlAttributes import (
|
||||
MainheatControlAttributes,
|
||||
)
|
||||
from etl.epc_clean.epc_attributes.WindowAttributes import WindowAttributes
|
||||
from etl.epc_clean.epc_attributes.MainFuelAttributes import MainFuelAttributes
|
||||
|
||||
|
|
@ -169,7 +171,7 @@ class TrainingDataset(BaseDataset):
|
|||
self.df = pd.DataFrame([dataset.difference_record for dataset in datasets])
|
||||
|
||||
self._feature_generation()
|
||||
self._drop_features()
|
||||
# self._drop_features()
|
||||
self._clean_efficiency_variables()
|
||||
self._null_validation(information="Clean Efficiency Variables")
|
||||
self._expand_description_to_features(cleaned_lookup)
|
||||
|
|
@ -210,11 +212,11 @@ class TrainingDataset(BaseDataset):
|
|||
common_cols = [[col + "_starting", col + "_ending"] for col in common_cols]
|
||||
|
||||
self.df = self.df.loc[
|
||||
:,
|
||||
no_suffix_cols
|
||||
+ only_ending_cols
|
||||
+ [col for cols in common_cols for col in cols],
|
||||
]
|
||||
:,
|
||||
no_suffix_cols
|
||||
+ only_ending_cols
|
||||
+ [col for cols in common_cols for col in cols],
|
||||
]
|
||||
|
||||
def _remove_abnormal_change_in_floor_area(self):
|
||||
"""
|
||||
|
|
@ -519,7 +521,7 @@ class TrainingDataset(BaseDataset):
|
|||
expanded_df["is_sandstone_or_limestone"]
|
||||
== expanded_df["is_sandstone_or_limestone_ending"]
|
||||
)
|
||||
]
|
||||
]
|
||||
elif component == "floor":
|
||||
expanded_df = expanded_df[
|
||||
(expanded_df["is_suspended"] == expanded_df["is_suspended_ending"])
|
||||
|
|
@ -536,7 +538,7 @@ class TrainingDataset(BaseDataset):
|
|||
expanded_df["is_to_external_air"]
|
||||
== expanded_df["is_to_external_air_ending"]
|
||||
)
|
||||
]
|
||||
]
|
||||
elif component == "roof":
|
||||
expanded_df = expanded_df[
|
||||
(expanded_df["is_pitched"] == expanded_df["is_pitched_ending"])
|
||||
|
|
@ -549,7 +551,7 @@ class TrainingDataset(BaseDataset):
|
|||
expanded_df["has_dwelling_above"]
|
||||
== expanded_df["has_dwelling_above_ending"]
|
||||
)
|
||||
]
|
||||
]
|
||||
|
||||
return expanded_df
|
||||
|
||||
|
|
@ -695,10 +697,14 @@ class TrainingDataset(BaseDataset):
|
|||
cleaned_lookup_df_for_key = pd.DataFrame(cleaned_lookup[cleaned_key])
|
||||
|
||||
# We handle a specific edge case where we're missing information for the original description
|
||||
descriptions = [x for x in self.df[left_on_starting].unique() if pd.notnull(x)]
|
||||
descriptions = [
|
||||
x for x in self.df[left_on_starting].unique() if pd.notnull(x)
|
||||
]
|
||||
# take any not in the cleaned lookup
|
||||
missing_descriptions = [
|
||||
x for x in descriptions if x not in cleaned_lookup_df_for_key["original_description"].values
|
||||
x
|
||||
for x in descriptions
|
||||
if x not in cleaned_lookup_df_for_key["original_description"].values
|
||||
]
|
||||
if missing_descriptions:
|
||||
# We handle them here
|
||||
|
|
@ -710,9 +716,12 @@ class TrainingDataset(BaseDataset):
|
|||
cleaned_data.append(
|
||||
{
|
||||
"original_description": x,
|
||||
"clean_description": desc_cleaner.description.replace("(assumed)",
|
||||
"").rstrip().capitalize(),
|
||||
**cleaned
|
||||
"clean_description": desc_cleaner.description.replace(
|
||||
"(assumed)", ""
|
||||
)
|
||||
.rstrip()
|
||||
.capitalize(),
|
||||
**cleaned,
|
||||
}
|
||||
)
|
||||
cleaned_lookup_df_for_key = pd.concat(
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load diff
Loading…
Add table
Reference in a new issue