Adding pruning and processing of descriptions

2026-07-27 23:35:01 +00:00 · 2023-09-14 16:23:05 +01:00 · 2023-09-14 16:23:05 +01:00 · e0019662c9
commit e0019662c9
parent c2fcd0041f
4 changed files with 166 additions and 16 deletions
--- a/.idea/Model.iml
+++ b/.idea/Model.iml
@ -7,7 +7,7 @@
      <sourceFolder url="file://$MODULE_DIR$/open_uprn" isTestSource="false" />
      <sourceFolder url="file://$MODULE_DIR$/recommendations" isTestSource="false" />
    </content>
-    <orderEntry type="jdk" jdkName="Python 3.10 (backend)" jdkType="Python SDK" />
+    <orderEntry type="jdk" jdkName="Python 3.10 (model_data)" jdkType="Python SDK" />
    <orderEntry type="sourceFolder" forTests="false" />
  </component>
 </module>
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
@ -1,6 +1,6 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <project version="4">
-  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (backend)" project-jdk-type="Python SDK" />
+  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (model_data)" project-jdk-type="Python SDK" />
  <component name="PythonCompatibilityInspectionAdvertiser">
    <option name="version" value="3" />
  </component>
--- a/model_data/simulation_system/core/Settings.py
+++ b/model_data/simulation_system/core/Settings.py
@ -134,6 +134,7 @@ EARLIEST_EPC_DATE = "2014-08-01"

 RDSAP_RESPONSE = "CURRENT_ENERGY_EFFICIENCY"
 HEAT_DEMAND_RESPONSE = "ENERGY_CONSUMPTION_CURRENT"
+CARBON_RESPONSE = "CO2_EMISSIONS_CURRENT"


 def ordinal(n):
--- a/model_data/simulation_system/generate_rdsap_change.py
+++ b/model_data/simulation_system/generate_rdsap_change.py
@ -1,5 +1,6 @@
 import pandas as pd
 from tqdm import tqdm
+import msgpack

 from pathlib import Path
 from model_data.simulation_system.core.Settings import (
@ -9,20 +10,42 @@ from model_data.simulation_system.core.Settings import (
    RDSAP_RESPONSE,
    HEAT_DEMAND_RESPONSE,
    COLUMNS_TO_MERGE_ON,
-    EARLIEST_EPC_DATE
+    EARLIEST_EPC_DATE,
+    CARBON_RESPONSE,
 )
 from model_data.simulation_system.core.DataProcessor import DataProcessor
-from utils.s3 import save_dataframe_to_s3_parquet
+from utils.s3 import save_dataframe_to_s3_parquet, read_from_s3

 DATA_DIRECTORY = Path(__file__).parent / "model_data" / "simulation_system" / "data" / "all-domestic-certificates"


+def get_cleaned():
+    """
+    Retrieve the cleaned lookup from S3, which holds the cleaned descriptions
+    for the EPC dataset.
+
+    The object is MessagePack-encoded (despite its ".bson" file name), so it is decoded here.
+    :return: the decoded MessagePack payload
+    """
+
+    cleaned = read_from_s3(
+        s3_file_name="cleaned_epc_data/cleaned.bson",
+        bucket_name="retrofit-data-dev"
+    )
+
+    cleaned = msgpack.unpackb(cleaned, raw=False)
+
+    return cleaned
+
+
 def app():
    # Get all the files in the directory

    # Data glossary:
    # https://epc.opendatacommunities.org/docs/guidance#glossary

+    cleaned_lookup = get_cleaned()
+
    # List all subdirectories
    directories = [entry for entry in DATA_DIRECTORY.iterdir() if entry.is_dir()]

@ -84,7 +107,7 @@ def app():
            # We include the lodgement date here as we probably need to factor time into the
            # model, since EPC standards and rigour have changed over time
            variable_data = modified_property_data[
-                COMPONENT_FEATURES + ["LODGEMENT_DATE", RDSAP_RESPONSE, HEAT_DEMAND_RESPONSE]
+                COMPONENT_FEATURES + ["LODGEMENT_DATE", RDSAP_RESPONSE, HEAT_DEMAND_RESPONSE, CARBON_RESPONSE]
                ]

            # Note: we look at changes between subsequent EPCS, however we could look at other permutations
@ -104,24 +127,29 @@ def app():
                if gets_better:
                    starting_sap = earliest_record[RDSAP_RESPONSE]
                    starting_heat_demand = earliest_record[HEAT_DEMAND_RESPONSE]
+                    starting_carbon = earliest_record[CARBON_RESPONSE]
+
                    rdsap_change = latest_record[RDSAP_RESPONSE] - starting_sap
                    heat_demand_change = latest_record[HEAT_DEMAND_RESPONSE] - starting_heat_demand
-                else:
-                    starting_sap = latest_record[RDSAP_RESPONSE]
-                    starting_heat_demand = latest_record[HEAT_DEMAND_RESPONSE]
-                    rdsap_change = earliest_record[RDSAP_RESPONSE] - starting_sap
-                    heat_demand_change = earliest_record[HEAT_DEMAND_RESPONSE] - starting_heat_demand
+                    carbon_change = latest_record[CARBON_RESPONSE] - starting_carbon

-                if rdsap_change == 0:
-                    continue
-
-                if gets_better:
                    starting_record = earliest_record[COMPONENT_FEATURES + ["LODGEMENT_DATE"]].add_suffix("_STARTING")
                    ending_record = latest_record[COMPONENT_FEATURES + ["LODGEMENT_DATE"]].add_suffix("_ENDING")
                else:
+                    starting_sap = latest_record[RDSAP_RESPONSE]
+                    starting_heat_demand = latest_record[HEAT_DEMAND_RESPONSE]
+                    starting_carbon = latest_record[CARBON_RESPONSE]
+
+                    rdsap_change = earliest_record[RDSAP_RESPONSE] - starting_sap
+                    heat_demand_change = earliest_record[HEAT_DEMAND_RESPONSE] - starting_heat_demand
+                    carbon_change = earliest_record[CARBON_RESPONSE] - starting_carbon
+
                    starting_record = latest_record[COMPONENT_FEATURES + ["LODGEMENT_DATE"]].add_suffix("_STARTING")
                    ending_record = earliest_record[COMPONENT_FEATURES + ["LODGEMENT_DATE"]].add_suffix("_ENDING")

+                if rdsap_change == 0:
+                    continue
+
                features = pd.concat([starting_record, ending_record])

                property_model_data.append(
@ -129,8 +157,10 @@ def app():
                        "UPRN": uprn,
                        "RDSAP_CHANGE": rdsap_change,
                        "HEAT_DEMAND_CHANGE": heat_demand_change,
-                        "STARTING_SAP": starting_sap,
-                        "STARTING_HEAT_DEMAND": starting_heat_demand,
+                        "CARBON_CHANGE": carbon_change,
+                        "SAP_STARTING": starting_sap,
+                        "HEAT_DEMAND_STARTING": starting_heat_demand,
+                        "CARBON_STARTING": starting_carbon,
                        **fixed_data,
                        **features.to_dict(),
                    }
@ -152,6 +182,125 @@ def app():
        #       floors, we may want to use the U-value. We may also want to handle the (assumed) tags
        #       within descriptions

+        # We look for key building fabric features that have changed from one EPC to the next.
+        # If, for example, we see that a home has gone from having cavity walls to solid walls, we
+        # remove the record, as this indicates that the quality of the EPC conducted in the first
+        # instance was low.
+        # We also replace descriptions with their cleaned variants.
+
+        def process_and_prune_desriptions(df, cleaned_lookup):
+
+            # TODO: In a future iteration, we can test using the binary features and the insulation thickness
+            #       estimates, as well as estimated U-values
+
+            cols_to_drop = {
+                "walls": [
+                    'original_description', 'thermal_transmittance',
+                    'thermal_transmittance_unit', 'is_cavity_wall', 'is_filled_cavity',
+                    'is_solid_brick', 'is_system_built', 'is_timber_frame',
+                    'is_granite_or_whinstone', 'is_as_built', 'is_cob', 'is_assumed',
+                    'is_sandstone_or_limestone', 'insulation_thickness',
+                    'external_insulation', 'internal_insulation',
+                    'original_description_ENDING',
+                    'thermal_transmittance_ENDING', 'thermal_transmittance_unit_ENDING',
+                    'is_cavity_wall_ENDING', 'is_filled_cavity_ENDING',
+                    'is_solid_brick_ENDING', 'is_system_built_ENDING',
+                    'is_timber_frame_ENDING', 'is_granite_or_whinstone_ENDING',
+                    'is_as_built_ENDING', 'is_cob_ENDING', 'is_assumed_ENDING',
+                    'is_sandstone_or_limestone_ENDING', 'insulation_thickness_ENDING',
+                    'external_insulation_ENDING', 'internal_insulation_ENDING',
+                ],
+                "floor": [
+                    'original_description', 'thermal_transmittance',
+                    'thermal_transmittance_unit', 'is_assumed', 'is_to_unheated_space',
+                    'is_to_external_air', 'is_suspended', 'is_solid',
+                    'another_property_below', 'insulation_thickness', 'no_data',
+                    'original_description_ENDING',
+                    'thermal_transmittance_ENDING', 'thermal_transmittance_unit_ENDING',
+                    'is_assumed_ENDING', 'is_to_unheated_space_ENDING',
+                    'is_to_external_air_ENDING', 'is_suspended_ENDING', 'is_solid_ENDING',
+                    'another_property_below_ENDING', 'insulation_thickness_ENDING',
+                    'no_data_ENDING',
+                ],
+                "roof": [
+                    'original_description', 'clean_description', 'thermal_transmittance',
+                    'thermal_transmittance_unit', 'is_pitched', 'is_roof_room', 'is_loft',
+                    'is_flat', 'is_thatched', 'is_at_rafters', 'is_assumed',
+                    'has_dwelling_above', 'is_valid', 'insulation_thickness',
+                    'original_description_ENDING', 'clean_description_ENDING',
+                    'thermal_transmittance_ENDING', 'thermal_transmittance_unit_ENDING',
+                    'is_pitched_ENDING', 'is_roof_room_ENDING', 'is_loft_ENDING',
+                    'is_flat_ENDING', 'is_thatched_ENDING', 'is_at_rafters_ENDING',
+                    'is_assumed_ENDING', 'has_dwelling_above_ENDING', 'is_valid_ENDING',
+                    'insulation_thickness_ENDING',
+                ]
+
+            }
+
+            for component in ["walls", "floor", "roof"]:
+                component_upper = component.upper()
+
+                df = df.merge(
+                    pd.DataFrame(cleaned_lookup[f"{component}-description"]),
+                    how="left",
+                    left_on=f"{component_upper}_DESCRIPTION_STARTING",
+                    right_on="original_description",
+                ).merge(
+                    pd.DataFrame(cleaned_lookup[f"{component}-description"]),
+                    how="left",
+                    left_on=f"{component_upper}_DESCRIPTION_ENDING",
+                    right_on="original_description",
+                    suffixes=("", "_ENDING")
+                )
+
+                if component == "walls":
+                    # We make sure the wall construction hasn't changed
+                    df = df[
+                        (df["is_cavity_wall"] == df["is_cavity_wall_ENDING"]) &
+                        (df["is_solid_brick"] == df["is_solid_brick_ENDING"]) &
+                        (df["is_timber_frame"] == df["is_timber_frame_ENDING"]) &
+                        (df["is_granite_or_whinstone"] == df["is_granite_or_whinstone_ENDING"]) &
+                        (df["is_cob"] == df["is_cob_ENDING"]) &
+                        (df["is_sandstone_or_limestone"] == df["is_sandstone_or_limestone_ENDING"])
+                        ]
+                elif component == "floor":
+                    df = df[
+                        (df["is_suspended"] == df["is_suspended_ENDING"]) &
+                        (df["is_solid"] == df["is_solid_ENDING"]) &
+                        (df["another_property_below"] == df["another_property_below_ENDING"]) &
+                        (df["is_to_unheated_space"] == df["is_to_unheated_space_ENDING"])
+                        ]
+                else:
+                    df = df[
+                        (df["is_pitched"] == df["is_pitched_ENDING"]) &
+                        (df["is_roof_room"] == df["is_roof_room_ENDING"]) &
+                        (df["is_loft"] == df["is_loft_ENDING"]) &
+                        (df["is_flat"] == df["is_flat_ENDING"]) &
+                        (df["is_thatched"] == df["is_thatched_ENDING"]) &
+                        (df["is_at_rafters"] == df["is_at_rafters_ENDING"]) &
+                        (df["has_dwelling_above"] == df["has_dwelling_above_ENDING"])
+                        ]
+
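+                # Drop the binary indicators and replace the original description with the cleaned version.
+                # NOTE(review): cols_to_drop["roof"] also lists clean_description(_ENDING); confirm the rename below.
+                # Drop original cols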
+                original_cols = [
+                    f"{component_upper}_DESCRIPTION_STARTING", f"{component_upper}_DESCRIPTION_ENDING"
+                ]
+
+                df = df.drop(
+                    columns=cols_to_drop[component] + original_cols
+                ).rename(
+                    columns={
+                        "clean_description": f"{component_upper}_DESCRIPTION_STARTING",
+                        "clean_description_ENDING": f"{component_upper}_DESCRIPTION_ENDING",
+                    }
+                )
+
+            return df
+
+        data_by_urpn_df = process_and_prune_desriptions(data_by_urpn_df, cleaned_lookup)
+
        dataset.append(data_by_urpn_df)

        cleaning_averages["LOCAL_AUTHORITY"] = df["LOCAL_AUTHORITY"].values[0]