fixed bug which is dropping roof description

2026-07-27 23:35:01 +00:00 · 2023-09-17 22:20:23 +01:00 · 2023-09-17 22:20:23 +01:00 · 10fc349114
commit 10fc349114
parent b71e76449f
3 changed files with 20 additions and 20 deletions
--- a/model_data/simulation_system/core/DataProcessor.py
+++ b/model_data/simulation_system/core/DataProcessor.py
@ -414,6 +414,7 @@ class DataProcessor:
        # We observed 7 final records with missing windows and 2 records with missing hot water so we shall remove them
        self.data = self.data[~pd.isnull(self.data["WINDOWS_DESCRIPTION"])]
        self.data = self.data[~pd.isnull(self.data["HOTWATER_DESCRIPTION"])]
+        self.data = self.data[~pd.isnull(self.data["ROOF_DESCRIPTION"])]

    def clean_multi_glaze_proportion(self) -> None:
        """
--- a/model_data/simulation_system/core/Settings.py
+++ b/model_data/simulation_system/core/Settings.py
@ -211,4 +211,5 @@ fill_na_map = {
    "LOW_ENERGY_LIGHTING": 0,
    "MAINHEATCONT_DESCRIPTION": "Unknown",
    "EXTENSION_COUNT": 0,
+    "NUMBER_OPEN_FIREPLACES": 0
 }
--- a/model_data/simulation_system/generate_rdsap_change.py
+++ b/model_data/simulation_system/generate_rdsap_change.py
@ -81,11 +81,11 @@ def process_and_prune_desriptions(df, cleaned_lookup):
            'no_data_ENDING',
        ],
        "roof": [
-            'original_description', 'clean_description', 'thermal_transmittance',
+            'original_description', 'thermal_transmittance',
            'thermal_transmittance_unit', 'is_pitched', 'is_roof_room', 'is_loft',
            'is_flat', 'is_thatched', 'is_at_rafters', 'is_assumed',
            'has_dwelling_above', 'is_valid', 'insulation_thickness',
-            'original_description_ENDING', 'clean_description_ENDING',
+            'original_description_ENDING',
            'thermal_transmittance_ENDING', 'thermal_transmittance_unit_ENDING',
            'is_pitched_ENDING', 'is_roof_room_ENDING', 'is_loft_ENDING',
            'is_flat_ENDING', 'is_thatched_ENDING', 'is_at_rafters_ENDING',
@ -180,13 +180,25 @@ def app():
        df = data_processor.pre_process()
        cleaning_averages = data_processor.make_cleaning_averages()

+        # We have some odd cases with missing constituency, so we fill them with the most common value (the mode)
+        df = df.fillna({"CONSTITUENCY": df["CONSTITUENCY"].mode().values[0]})
+
+        df = DataProcessor.apply_averages_cleaning(
+            data_to_clean=df,
+            cleaning_data=cleaning_averages,
+            cols_to_merge_on=COLUMNS_TO_MERGE_ON
+        )
+
        data_by_urpn = []
        for uprn, property_data in df.groupby("UPRN", observed=True):
+
            # Fixed features - these are property attributes that shouldn't change over time
            fixed_data = {}

            # If a property has changed building type, we can ignore the epc rating i.e. this should be 1 unique row
-            if any(property_data[MANDATORY_FIXED_FEATURES].nunique() > 1):
+            if any(property_data[MANDATORY_FIXED_FEATURES].nunique() > 1) or (
+                pd.isnull(property_data[MANDATORY_FIXED_FEATURES]).sum().sum() > 0
+            ):
                continue

            # Take the latest row for both the LATEST_FEILDS and MANDATORY FIELDS
@ -195,36 +207,22 @@ def app():
                property_data[MANDATORY_FIXED_FEATURES].iloc[-1].to_dict()
            )

-            # Extract the columns that are not all None
-            modified_property_data = DataProcessor.apply_averages_cleaning(
-                data_to_clean=property_data,
-                cleaning_data=cleaning_averages,
-                cols_to_merge_on=COLUMNS_TO_MERGE_ON
-            )
-
            # Combine all fields together
            fixed_data.update(mandatory_field_data)
            fixed_data.update(latest_field_data)

-            # Apply cleaning to fixed_data
-            fixed_data = DataProcessor.apply_averages_cleaning(
-                data_to_clean=pd.DataFrame([fixed_data]),
-                cleaning_data=cleaning_averages,
-                cols_to_merge_on=COLUMNS_TO_MERGE_ON
-            ).to_dict("records")[0]
-
            # We include the lodgement date here as we probably need to factor time into the
            # model, since EPC standards and rigour have changed over time
-            variable_data = modified_property_data[
+            variable_data = property_data[
                COMPONENT_FEATURES + ["LODGEMENT_DATE", RDSAP_RESPONSE, HEAT_DEMAND_RESPONSE, CARBON_RESPONSE]
                ]

            # Note: we look at changes between subsequent EPCS, however we could look at other permutations
            # e.g. first vs second, second vs third and also first vs third
            property_model_data = []
-            for idx in range(0, modified_property_data.shape[0] - 1):
+            for idx in range(0, property_data.shape[0] - 1):

-                if idx >= modified_property_data.shape[0] - 1:
+                if idx >= property_data.shape[0] - 1:
                    break

                earliest_record = variable_data.iloc[idx]