Investigating prospective changes to rdsap data

2026-07-27 23:35:01 +00:00 · 2023-09-06 16:47:18 +01:00 · 2023-09-06 16:47:18 +01:00 · e5f4e96f00
commit e5f4e96f00
parent e05e8ff636
3 changed files with 206 additions and 10 deletions
--- a/model_data/simulation_system/core/DataProcessor.py
+++ b/model_data/simulation_system/core/DataProcessor.py
@ -33,19 +33,132 @@ class DataProcessor:
    def insert_data(self, data: pd.DataFrame) -> None:
        self.data = data

+    def standardise_construction_age_band(self):
+        """
+        Map non-standard CONSTRUCTION_AGE_BAND values onto the standard band labels and
+        store the result in a new CONSTRUCTION_AGE_BAND_CLEANED column.
+
+        Missing values and values in Definitions.DATA_ANOMALY_MATCHES are passed through
+        unchanged; bare years (e.g. "1985") are replaced by the band that contains them.
+
+        Raises:
+            NotImplementedError: if a value is neither a known band, a known alias nor a
+                year covered by the bands.
+        """
+        # Inclusive lower ("l") and upper ("u") year bounds for every standard band
+        bounds_map = {
+            "England and Wales: before 1900": {"l": 0, "u": 1899},
+            "England and Wales: 1900-1929": {"l": 1900, "u": 1929},
+            "England and Wales: 1930-1949": {"l": 1930, "u": 1949},
+            "England and Wales: 1950-1966": {"l": 1950, "u": 1966},
+            "England and Wales: 1967-1975": {"l": 1967, "u": 1975},
+            "England and Wales: 1976-1982": {"l": 1976, "u": 1982},
+            "England and Wales: 1983-1990": {"l": 1983, "u": 1990},
+            "England and Wales: 1991-1995": {"l": 1991, "u": 1995},
+            "England and Wales: 1996-2002": {"l": 1996, "u": 2002},
+            "England and Wales: 2003-2006": {"l": 2003, "u": 2006},
+            "England and Wales: 2007-2011": {"l": 2007, "u": 2011},
+            "England and Wales: 2012 onwards": {"l": 2012, "u": 3000},
+        }
+
+        # Non-standard labels that are aliases of a standard band
+        remap = {"England and Wales: 2007 onwards": "England and Wales: 2007-2011"}
+
+        # Year -> band label lookup, built once so each row is a single dict access
+        expanded_map = {
+            year: label
+            for label, bounds in bounds_map.items()
+            for year in range(bounds["l"], bounds["u"] + 1)
+        }
+
+        def clean_band(x):
+            # Leave missing values and anomaly markers untouched. pd.isnull is used because
+            # a list membership test only recognises the np.nan singleton, not every NaN.
+            if pd.isnull(x) or x in Definitions.DATA_ANOMALY_MATCHES:
+                return x
+
+            # Already a standard band, or a known alias of one
+            if x in bounds_map:
+                return x
+            if x in remap:
+                return remap[x]
+
+            # Otherwise it should be a bare year
+            try:
+                year = int(x)
+            except (TypeError, ValueError, OverflowError):
+                year = None
+
+            if year not in expanded_map:
+                raise NotImplementedError("Not handled the case for value %s" % x)
+            return expanded_map[year]
+
+        self.data["CONSTRUCTION_AGE_BAND_CLEANED"] = self.data["CONSTRUCTION_AGE_BAND"].apply(clean_band)
+
+    def clean_missing_rooms(self):
+        """
+        Fill missing NUMBER_HEATED_ROOMS and NUMBER_HABITABLE_ROOMS values with the median of
+        comparable properties.
+
+        Properties are first matched on property type, built form, cleaned construction age band
+        and postal area; while gaps remain, the last matching column is dropped and the fill is
+        retried. Adds a POSTAL_AREA column (the text before the first space of POSTCODE) and
+        replaces self.data with the merged frame.
+
+        Raises:
+            NotImplementedError: if values are still missing after matching on PROPERTY_TYPE alone.
+
+        TODO: We could use a model based imputation approach for possibly more accurate cleaning
+        """
+        # The .str accessor keeps a missing POSTCODE as missing instead of failing on float.split
+        self.data["POSTAL_AREA"] = self.data["POSTCODE"].str.split(" ").str[0]
+
+        def apply_clean(data, col, matching_columns):
+            # Median of the known values for every combination of the matching columns
+            medians = (
+                data[data[col].notnull()]
+                .groupby(matching_columns)[col]
+                .median()
+                .reset_index()
+            )
+            data = data.merge(
+                medians, how="left", on=matching_columns, suffixes=("", "_CLEANING")
+            )
+            # Only rows that are missing a value take the group median
+            data[col] = np.where(pd.isnull(data[col]), data[f"{col}_CLEANING"], data[col])
+            return data.drop(columns=f"{col}_CLEANING")
+
+        # Columns are dropped from the right, so the broadest match is PROPERTY_TYPE on its own
+        matching_columns = ["PROPERTY_TYPE", "BUILT_FORM", "CONSTRUCTION_AGE_BAND_CLEANED", "POSTAL_AREA"]
+
+        for col in ["NUMBER_HEATED_ROOMS", "NUMBER_HABITABLE_ROOMS"]:
+            for n_columns in range(len(matching_columns), 0, -1):
+                if not self.data[col].isnull().any():
+                    break
+                self.data = apply_clean(self.data, col, matching_columns[:n_columns])
+            if self.data[col].isnull().any():
+                # Even the broadest match left gaps - something has gone wrong or the property is very unusual
+                raise NotImplementedError("Handle this edge case, we still have missings for column %s" % col)
+
    def pre_process(self) -> pd.DataFrame:
        """
        Load data and begin initial cleaning
        """
        if not self.data:
            self.load_data(low_memory=DATA_PROCESSOR_SETTINGS["low_memory"])
+
        self.confine_data()

-        # TODO: CLean number of heated rooms and habitable rooms
+        # We have some non-standard construction age bands which we'll clean for matching
+        self.standardise_construction_age_band()
+        self.clean_missing_rooms()
+
        self.recast_df_columns(
            column_mappings=DATA_PROCESSOR_SETTINGS["column_mappings"]
        )
        self.clean_multi_glaze_proportion()
+        self.clean_photo_supply()
+
        self.retain_multiple_epc_properties(
            epc_minimum_count=DATA_PROCESSOR_SETTINGS["epc_minimum_count"]
        )
@ -235,8 +348,7 @@ class DataProcessor:

        for key, values in column_mappings.items():
            if key not in self.data.columns:
-                print("Column mapping incorrectly specified")
-                exit(1)
+                raise ValueError("Column mapping incorrectly specified")
            for value in values:
                self.data[key] = self.data[key].astype(value)

@ -272,6 +384,13 @@ class DataProcessor:
        ) & (self.data["WINDOWS_DESCRIPTION"].isin(FULLY_GLAZED_DESCRIPTIONS))
        self.data.loc[no_multi_glaze_proportion_index, "MULTI_GLAZE_PROPORTION"] = 100

+    def clean_photo_supply(self) -> None:
+        """
+        Replace missing PHOTO_SUPPLY values with 0, leaving recorded values untouched.
+        """
+        photo_supply = self.data["PHOTO_SUPPLY"]
+        self.data["PHOTO_SUPPLY"] = photo_supply.fillna(value=0)
+
    @staticmethod
    def apply_averages_cleaning(data_to_clean, cleaning_data, cols_to_merge_on):
        """
--- a/model_data/simulation_system/core/Settings.py
+++ b/model_data/simulation_system/core/Settings.py
@ -53,6 +53,12 @@ DEPLOYMENT_FOLDER = "deployment"
 TOTAL_FLOOR_AREA_NATIONAL_AVERAGE = 70
 FLOOR_HEIGHT_NATIONAL_AVERAGE = 2.45

+# If we have multiple records for a numerical field, such as floor area,
+# we check the margin for error between the largest and smallest values. If we see large
+# swings in measured values, we interpret this as inaccurate measurements in the past
+# and take the most recent value for this field
+MULTIPLE_VALUES_MARGIN_FOR_ERROR = 0.1
+
 COLUMNS_TO_MERGE_ON = [
    "PROPERTY_TYPE",
    "BUILT_FORM",
--- a/model_data/simulation_system/generate_rdsap_change.py
+++ b/model_data/simulation_system/generate_rdsap_change.py
@ -11,6 +11,7 @@ from simulation_system.core.Settings import (
    RDSAP_RESPONSE,
    HEAT_DEMAND_RESPONSE,
    COLUMNS_TO_MERGE_ON,
+    MULTIPLE_VALUES_MARGIN_FOR_ERROR,
 )
 from simulation_system.core.DataProcessor import DataProcessor
 from utils import save_dataframe_to_s3_parquet
@ -32,10 +33,60 @@ def app():

    dataset = []
    cleaning_dataset = []
-    # 116
-    # 128048706
-    # PosixPath('/home/ubuntu/Documents/python/hestia/Model/model_data/simulation_system/data/all-domestic
-    # -certificates/domestic-E09000021-Kingston-upon-Thames')
+
+    # TODO: Does energy tariff make a difference
+    # TODO: If SAP hasn't changed, we don't include the record
+    # TODO: Floor area will impact the EPC so instead of averaging, we should have a starting and ending value.
+    # TODO: Same as floor area for floor height
+    # TODO: If fundamental building fabric changes, we should probably discard the record
+    # TODO: Should we prune records that have an exceptionally large amount of time between them?
+    # TODO: If we have multiple EPCs lodged on the same day, should we remove them? Could be corrections?
+    #
+    # TODO: REMOVE ME
+    dodgy_uprns = []
+    observed_uprns = [
+        "10002082244",  # Doesn't really make sense, house no longer has lel and now has more insulation but lower score
+        "10002082259",
+        # Property has more roof insulation, lel, but now the floor isn't insulated and has a lower score. Also the
+        # floor assessment is now assumed whereas before it wasnt
+        "10002082418",  # Walls went from insulated to not...
+        "10002082640",  # Property identical besides different energy tariff
+        "10002082830",  # Lots of records going from not insulated to insulated but some parts of
+        # the property has gotten better
+        "10002083244",  # latest epc indicates the property is worse
+        "10002083592",  # latest epc doesn't have a fuel system present, but has slightly more insulation. Also the
+        # floor type has changed from solid to suspended. lel has decreased
+        "100030533576",  # property slightly worse, has less lels and the floor description has changed type
+        "100030533668",  # has slightly less lels. Glazed type is now missing
+        "100030533803",  # Not super clear why this is lower, newer epc has more lel but is using second heating
+        "100030534016",  # Property has less lel but more roof insulation. Floor type has changed
+        "100030534040",  # property has less lel and the floor type has changed
+        "100030534041",  # property has less insulation and less lel
+        "100030534243",  # Cavity wall has gone from filled to unfilled
+        "100030534294",  # less roof insulation but now has an air source heat pump
+        "100030534322",  # identical between records but now with higher lel but no change recorded
+        "100030534413",  # identical between records but different energy tariff, no sap change
+        "100030534437",  # property has less lel and the mainheating no longer has a programmer and trvs
+        "100030534569",  # Cavity wall no longer filled, 30mm more roof insulation in newest epc
+        "100030534676",  # Property has less lel, is now using secondary heating, has 50mm less roof insulation, but
+        # the wall cavity is no longer filled
+        "100030534732",  # property has higher lel %. Not clear why this is worse, glazing type has changed.
+        # This looks dodgy has the UPRN_SOURCE is address matched also the floor area has increased from the first to
+        # the later epc
+        "100030534791",  # Property has started using secondary heating - the EPCs are taken on the same day so maybe we
+        # should discard
+        "100030534795",  # More lel but a lot less insulation. This is a very dodgy record, sap has gone from 90 to 66
+        # The newer epc indicates the property now has 40% photo supply so this doesn't make much sense
+        "100030534897",  # Roof has gone from thatched with additional insulation to pitched with insulation,
+        # sap score hasn't changed
+        "100030534986",  # Property has gone from 300mm loft insulation to none. has 2% higher lel (negligible) and
+        # slightly better main heating setup
+        "100030535043",  # Property lel increased by 12%, not clear why sap worse. Maybe due to different floor area and
+        # wall height
+        "100030535173",  # lel increased from 20% to 80% but roof gone from 100m insulation to "limited" insulation
+        "100030535244",  # lel gone from 100% to 0%, sap is the same
+    ]
+
    for directory in tqdm(directories):

        filepath = directory / "certificates.csv"
@ -74,9 +125,9 @@ def app():

                vals = list(modified_property_data[field].dropna().unique())
                if len(vals) > 1:
-                    # Check the values are too far apart
-                    # TODO: we could have multiple values here, why only use the first two?
-                    if abs(vals[0] - vals[1]) / vals[0] > 0.1:
+                    lowest_value = min(vals)
+                    largest_value = max(vals)
+                    if abs(largest_value - lowest_value) / lowest_value > MULTIPLE_VALUES_MARGIN_FOR_ERROR:
                        # Take the more recent value since it's likely to be more accurate
                        vals = [vals[-1]]

@ -111,6 +162,26 @@ def app():
                    - starting_record[HEAT_DEMAND_RESPONSE]
                )

+                # Check for a change in the starting and ending record
+                check_cols = [
+                    col for col in starting_record.index if col not in [
+                        "LODGEMENT_DATE", "CURRENT_ENERGY_EFFICIENCY", "ENERGY_CONSUMPTION_CURRENT", "ENERGY_TARIFF"
+                    ]
+                ]
+                all_same = True
+                for col in check_cols:
+                    if starting_record[col] != ending_record[col]:
+                        all_same = False
+                        break
+
+                if rdsap_change <= 0:
+                    if all_same | (uprn in observed_uprns):
+                        if uprn not in observed_uprns:
+                            dodgy_uprns.append(uprn)
+                    else:
+                        compare = pd.concat([starting_record, ending_record], axis=1)
+                        bljd
+
                # TODO: We need to pre-process the data. For instance, rather than using static for roofs, walls and
                #       floors, we may want to use the U-value. We may also want to handle the (assumed) tags
                #       within descriptions