fix pipeline for July 2025 data, keep lodgement date to do analysis on new rdsap standard in ML stage

2026-07-27 23:35:01 +00:00 · 2025-08-26 10:44:31 +01:00 · 2025-08-26 10:44:31 +01:00 · 723beaf104
commit 723beaf104
parent 6c6a44abfe
5 changed files with 775 additions and 161 deletions
--- a/.gitignore
+++ b/.gitignore
@ -275,4 +275,6 @@ cache/
 */.idea

 *.png
-*.pptx
+*.pptx
+
+local_data*
--- a/BaseUtility.py
+++ b/BaseUtility.py
@ -30,24 +30,25 @@ class Definitions:
        # The Building Emission Rate (BER) data field for non-domestic buildings may contain a ‘blank’ value. The BER
        # was only lodged on the register from 7 March 2010.
        "Blank"
-        # There are currently just over 8,600 records where the local authority identifier is ‘null’. This is due to 
-        # the Register Operator not being able to match the building address in the Markermap Ordinance Survey (GB) 
-        # lookup tables or OS MasterMap Address Layer 2 data. The majority of these addresses have been requested 
-        # manually by energy assessors for inclusion by the Register Operator in the registers (e.g. new builds, 
-        # etc). These records are being published for completeness. An ongoing process to manage these manually added 
+        # There are currently just over 8,600 records where the local authority identifier is ‘null’. This is due to
+        # the Register Operator not being able to match the building address in the Markermap Ordnance Survey (GB)
+        # lookup tables or OS MasterMap Address Layer 2 data. The majority of these addresses have been requested
+        # manually by energy assessors for inclusion by the Register Operator in the registers (e.g. new builds,
+        # etc). These records are being published for completeness. An ongoing process to manage these manually added
        # addresses will take time to develop to deal with these and future anomalies.
        #
-        # There are several fields within the lodged data where it is possible to enter multiple entries to cater for 
-        # different data_types of build within a single property, i.e. extensions. This results in multiple entries for 
-        # the description fields for floor, roof and wall. For the purposes of this data release only the information 
-        # contained within the first of these multiple entries is being provided. As there are no restrictions on the 
-        # value in this first field it means that sometimes the first field in a multiple entry description field may 
+        # There are several fields within the lodged data where it is possible to enter multiple entries to cater for
+        # different types of build within a single property, i.e. extensions. This results in multiple entries for
+        # the description fields for floor, roof and wall. For the purposes of this data release only the information
+        # contained within the first of these multiple entries is being provided. As there are no restrictions on the
+        # value in this first field it means that sometimes the first field in a multiple entry description field may
        # contain a ‘null’ value. A resolution to correct these anomalies will be considered for future data releases.
        "NULL",
        # We sometimes see fields populated with just an empty string.
        "",
        # An older value which rarely shows up but has been seen in the data.
        "UNKNOWN",
+        "Unknown",
    }

    DATA_ANOMALY_SUBSTRINGS = {
--- a/etl/epc/DataProcessor.py
+++ b/etl/epc/DataProcessor.py
@ -48,6 +48,8 @@ construction_age_bounds_map = {
    "England and Wales: 2003-2006": {"l": 2003, "u": 2006},
    "England and Wales: 2007-2011": {"l": 2007, "u": 2011},
    "England and Wales: 2012 onwards": {"l": 2012, "u": 3000},
+    "England and Wales: 2012-2021": {"l": 2012, "u": 2021},
+    "England and Wales: 2022 onwards": {"l": 2022, "u": 3000},
 }

 construction_age_remap = {
@ -384,7 +386,7 @@ class EPCDataProcessor:
            has_missings = pd.isnull(self.data[col]).sum()
            while has_missings:
                self.data = apply_clean(
-                    data=self.data, matching_columns=matching_columns[0: to_index + 1]
+                    data=self.data, matching_columns=matching_columns[0 : to_index + 1]
                )
                has_missings = pd.isnull(self.data[col]).sum()

@ -858,7 +860,9 @@ class EPCDataProcessor:

        # Fill NaN values with averages
        for col in cols_to_clean:
-            data_to_clean[col] = data_to_clean[col].fillna(data_to_clean[f"{col}_AVERAGE"])
+            data_to_clean[col] = data_to_clean[col].fillna(
+                data_to_clean[f"{col}_AVERAGE"]
+            )
            data_to_clean = data_to_clean.drop(columns=[f"{col}_AVERAGE"])
            # If we still have missings
            data_to_clean[col] = data_to_clean[col].fillna(data_to_clean[col].mean())
--- a/etl/epc/Dataset.py
+++ b/etl/epc/Dataset.py
@ -8,7 +8,9 @@ from etl.epc_clean.epc_attributes.FloorAttributes import FloorAttributes
 from etl.epc_clean.epc_attributes.RoofAttributes import RoofAttributes
 from etl.epc_clean.epc_attributes.HotWaterAttributes import HotWaterAttributes
 from etl.epc_clean.epc_attributes.MainheatAttributes import MainHeatAttributes
-from etl.epc_clean.epc_attributes.MainheatControlAttributes import MainheatControlAttributes
+from etl.epc_clean.epc_attributes.MainheatControlAttributes import (
+    MainheatControlAttributes,
+)
 from etl.epc_clean.epc_attributes.WindowAttributes import WindowAttributes
 from etl.epc_clean.epc_attributes.MainFuelAttributes import MainFuelAttributes

@ -169,7 +171,7 @@ class TrainingDataset(BaseDataset):
        self.df = pd.DataFrame([dataset.difference_record for dataset in datasets])

        self._feature_generation()
-        self._drop_features()
+        # self._drop_features()  # NOTE(review): disabled, presumably to keep the lodgement date for the ML stage - confirm
        self._clean_efficiency_variables()
        self._null_validation(information="Clean Efficiency Variables")
        self._expand_description_to_features(cleaned_lookup)
@ -210,11 +212,11 @@ class TrainingDataset(BaseDataset):
        common_cols = [[col + "_starting", col + "_ending"] for col in common_cols]

        self.df = self.df.loc[
-                  :,
-                  no_suffix_cols
-                  + only_ending_cols
-                  + [col for cols in common_cols for col in cols],
-                  ]
+            :,
+            no_suffix_cols
+            + only_ending_cols
+            + [col for cols in common_cols for col in cols],
+        ]

    def _remove_abnormal_change_in_floor_area(self):
        """
@ -519,7 +521,7 @@ class TrainingDataset(BaseDataset):
                    expanded_df["is_sandstone_or_limestone"]
                    == expanded_df["is_sandstone_or_limestone_ending"]
                )
-                ]
+            ]
        elif component == "floor":
            expanded_df = expanded_df[
                (expanded_df["is_suspended"] == expanded_df["is_suspended_ending"])
@ -536,7 +538,7 @@ class TrainingDataset(BaseDataset):
                    expanded_df["is_to_external_air"]
                    == expanded_df["is_to_external_air_ending"]
                )
-                ]
+            ]
        elif component == "roof":
            expanded_df = expanded_df[
                (expanded_df["is_pitched"] == expanded_df["is_pitched_ending"])
@ -549,7 +551,7 @@ class TrainingDataset(BaseDataset):
                    expanded_df["has_dwelling_above"]
                    == expanded_df["has_dwelling_above_ending"]
                )
-                ]
+            ]

        return expanded_df

@ -695,10 +697,14 @@ class TrainingDataset(BaseDataset):
            cleaned_lookup_df_for_key = pd.DataFrame(cleaned_lookup[cleaned_key])

            # We handle a specific edge case where we're missing information for the original description
-            descriptions = [x for x in self.df[left_on_starting].unique() if pd.notnull(x)]
+            descriptions = [
+                x for x in self.df[left_on_starting].unique() if pd.notnull(x)
+            ]
            # take any not in the cleaned lookup
            missing_descriptions = [
-                x for x in descriptions if x not in cleaned_lookup_df_for_key["original_description"].values
+                x
+                for x in descriptions
+                if x not in cleaned_lookup_df_for_key["original_description"].values
            ]
            if missing_descriptions:
                # We handle them here
@ -710,9 +716,12 @@ class TrainingDataset(BaseDataset):
                    cleaned_data.append(
                        {
                            "original_description": x,
-                            "clean_description": desc_cleaner.description.replace("(assumed)",
-                                                                                  "").rstrip().capitalize(),
-                            **cleaned
+                            "clean_description": desc_cleaner.description.replace(
+                                "(assumed)", ""
+                            )
+                            .rstrip()
+                            .capitalize(),
+                            **cleaned,
                        }
                    )
                cleaned_lookup_df_for_key = pd.concat(
--- a/recommendations/rdsap_tables.py
+++ b/recommendations/rdsap_tables.py