diff --git a/.idea/Model.iml b/.idea/Model.iml
index 05b9012b..b03b31b1 100644
--- a/.idea/Model.iml
+++ b/.idea/Model.iml
@@ -7,7 +7,7 @@
       <sourceFolder url="file://$MODULE_DIR$/open_uprn" isTestSource="false" />
       <sourceFolder url="file://$MODULE_DIR$/recommendations" isTestSource="false" />
     </content>
-    <orderEntry type="jdk" jdkName="Python 3.10 (backend)" jdkType="Python SDK" />
+    <orderEntry type="jdk" jdkName="Python 3.10 (model_data)" jdkType="Python SDK" />
     <orderEntry type="sourceFolder" forTests="false" />
   </component>
 </module>
\ No newline at end of file
diff --git a/.idea/misc.xml b/.idea/misc.xml
index 3b05c6ac..ca0e1cd9 100644
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
@@ -1,6 +1,6 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <project version="4">
-  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (backend)" project-jdk-type="Python SDK" />
+  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (model_data)" project-jdk-type="Python SDK" />
   <component name="PythonCompatibilityInspectionAdvertiser">
     <option name="version" value="3" />
   </component>
diff --git a/model_data/simulation_system/core/Settings.py b/model_data/simulation_system/core/Settings.py
index 8a03b553..c094c085 100644
--- a/model_data/simulation_system/core/Settings.py
+++ b/model_data/simulation_system/core/Settings.py
@@ -134,6 +134,7 @@ EARLIEST_EPC_DATE = "2014-08-01"
 
 RDSAP_RESPONSE = "CURRENT_ENERGY_EFFICIENCY"
 HEAT_DEMAND_RESPONSE = "ENERGY_CONSUMPTION_CURRENT"
+CARBON_RESPONSE = "CO2_EMISSIONS_CURRENT"
 
 
 def ordinal(n):
diff --git a/model_data/simulation_system/generate_rdsap_change.py b/model_data/simulation_system/generate_rdsap_change.py
index 42317edd..b3961ce1 100644
--- a/model_data/simulation_system/generate_rdsap_change.py
+++ b/model_data/simulation_system/generate_rdsap_change.py
@@ -1,5 +1,6 @@
 import pandas as pd
 from tqdm import tqdm
+import msgpack
 
 from pathlib import Path
 from model_data.simulation_system.core.Settings import (
@@ -9,20 +10,42 @@ from model_data.simulation_system.core.Settings import (
     RDSAP_RESPONSE,
     HEAT_DEMAND_RESPONSE,
     COLUMNS_TO_MERGE_ON,
-    EARLIEST_EPC_DATE
+    EARLIEST_EPC_DATE,
+    CARBON_RESPONSE,
 )
 from model_data.simulation_system.core.DataProcessor import DataProcessor
-from utils.s3 import save_dataframe_to_s3_parquet
+from utils.s3 import save_dataframe_to_s3_parquet, read_from_s3
 
 DATA_DIRECTORY = Path(__file__).parent / "model_data" / "simulation_system" / "data" / "all-domestic-certificates"
 
 
+def get_cleaned():
+    """
+    Fetch the cleaned EPC description lookup from S3 and decode it.
+
+    The object is read via ``read_from_s3`` and unpacked with ``msgpack`` (the key
+    ends in ``.bson``, but the payload is decoded as MessagePack here).
+
+    :return: the decoded object; ``app`` indexes it by keys such as
+        ``"walls-description"``
+    """
+    # raw=False asks msgpack to decode packed strings to ``str`` rather than bytes
+    packed_payload = read_from_s3(
+        bucket_name="retrofit-data-dev",
+        s3_file_name="cleaned_epc_data/cleaned.bson",
+    )
+
+    return msgpack.unpackb(packed_payload, raw=False)
+
+
 def app():
     # Get all the files in the directory
 
     # Data glossary:
     # https://epc.opendatacommunities.org/docs/guidance#glossary
 
+    cleaned_lookup = get_cleaned()
+
     # List all subdirectories
     directories = [entry for entry in DATA_DIRECTORY.iterdir() if entry.is_dir()]
 
@@ -84,7 +107,7 @@ def app():
             # We include the lodgement date here as we probably need to factor time into the
             # model, since EPC standards and rigour have changed over time
             variable_data = modified_property_data[
-                COMPONENT_FEATURES + ["LODGEMENT_DATE", RDSAP_RESPONSE, HEAT_DEMAND_RESPONSE]
+                COMPONENT_FEATURES + ["LODGEMENT_DATE", RDSAP_RESPONSE, HEAT_DEMAND_RESPONSE, CARBON_RESPONSE]
                 ]
 
             # Note: we look at changes between subsequent EPCS, however we could look at other permutations
@@ -104,24 +127,29 @@ def app():
                 if gets_better:
                     starting_sap = earliest_record[RDSAP_RESPONSE]
                     starting_heat_demand = earliest_record[HEAT_DEMAND_RESPONSE]
+                    starting_carbon = earliest_record[CARBON_RESPONSE]
+
                     rdsap_change = latest_record[RDSAP_RESPONSE] - starting_sap
                     heat_demand_change = latest_record[HEAT_DEMAND_RESPONSE] - starting_heat_demand
-                else:
-                    starting_sap = latest_record[RDSAP_RESPONSE]
-                    starting_heat_demand = latest_record[HEAT_DEMAND_RESPONSE]
-                    rdsap_change = earliest_record[RDSAP_RESPONSE] - starting_sap
-                    heat_demand_change = earliest_record[HEAT_DEMAND_RESPONSE] - starting_heat_demand
+                    carbon_change = latest_record[CARBON_RESPONSE] - starting_carbon
 
-                if rdsap_change == 0:
-                    continue
-
-                if gets_better:
                     starting_record = earliest_record[COMPONENT_FEATURES + ["LODGEMENT_DATE"]].add_suffix("_STARTING")
                     ending_record = latest_record[COMPONENT_FEATURES + ["LODGEMENT_DATE"]].add_suffix("_ENDING")
                 else:
+                    starting_sap = latest_record[RDSAP_RESPONSE]
+                    starting_heat_demand = latest_record[HEAT_DEMAND_RESPONSE]
+                    starting_carbon = latest_record[CARBON_RESPONSE]
+
+                    rdsap_change = earliest_record[RDSAP_RESPONSE] - starting_sap
+                    heat_demand_change = earliest_record[HEAT_DEMAND_RESPONSE] - starting_heat_demand
+                    carbon_change = earliest_record[CARBON_RESPONSE] - starting_carbon
+
                     starting_record = latest_record[COMPONENT_FEATURES + ["LODGEMENT_DATE"]].add_suffix("_STARTING")
                     ending_record = earliest_record[COMPONENT_FEATURES + ["LODGEMENT_DATE"]].add_suffix("_ENDING")
 
+                if rdsap_change == 0:
+                    continue
+
                 features = pd.concat([starting_record, ending_record])
 
                 property_model_data.append(
@@ -129,8 +157,10 @@ def app():
                         "UPRN": uprn,
                         "RDSAP_CHANGE": rdsap_change,
                         "HEAT_DEMAND_CHANGE": heat_demand_change,
-                        "STARTING_SAP": starting_sap,
-                        "STARTING_HEAT_DEMAND": starting_heat_demand,
+                        "CARBON_CHANGE": carbon_change,
+                        "SAP_STARTING": starting_sap,
+                        "HEAT_DEMAND_STARTING": starting_heat_demand,
+                        "CARBON_STARTING": starting_carbon,
                         **fixed_data,
                         **features.to_dict(),
                     }
@@ -152,6 +182,125 @@ def app():
         #       floors, we may want to use the U-value. We may also want to handle the (assumed) tags
         #       within descriptions
 
+        # We look for key building fabric features that have changed from one EPC to the next.
+        # If, for example, we see that a home has gone from being a cavity wall to a solid wall, we
+        # remove this record, as it indicates that the quality of the EPC conducted in the first instance
+        # is low.
+        # We also replace descriptions with their cleaned variants.
+
+        def process_and_prune_desriptions(df, cleaned_lookup):
+            """
+            Swap raw walls/floor/roof descriptions for their cleaned variants and prune
+            rows whose construction type differs between the starting and ending EPC.
+
+            For each component, ``df`` is left-merged twice against the lookup table
+            ``cleaned_lookup["<component>-description"]`` (keyed on
+            ``original_description``): once on ``<COMPONENT>_DESCRIPTION_STARTING`` and
+            once on ``<COMPONENT>_DESCRIPTION_ENDING`` (lookup columns suffixed
+            ``_ENDING``). Rows are kept only when every flag listed in
+            ``construction_flags`` is equal on both sides.
+
+            NOTE(review): a description with no lookup match yields missing flags, and
+            missing values never compare equal in pandas, so such rows appear to be
+            pruned as well - confirm this is intended.
+
+            :param df: one row per EPC pair, holding the ``*_DESCRIPTION_STARTING`` and
+                ``*_DESCRIPTION_ENDING`` columns for walls, floor and roof
+            :param cleaned_lookup: mapping with ``walls-description``,
+                ``floor-description`` and ``roof-description`` entries, each accepted
+                by ``pd.DataFrame``
+            :return: the pruned frame, with the description columns holding the
+                lookup's ``clean_description`` values
+            """
+
+            # TODO: In a future iteration, we can test using the binary features and the insulation thickness
+            #       estimates, as well as estimated U-values
+
+            # Lookup columns that are discarded once the rows have been pruned. The roof
+            # entry must NOT list 'clean_description': dropping it would leave nothing to
+            # rename back into ROOF_DESCRIPTION_STARTING / ROOF_DESCRIPTION_ENDING.
+            lookup_cols_to_drop = {
+                "walls": [
+                    'original_description', 'thermal_transmittance',
+                    'thermal_transmittance_unit', 'is_cavity_wall', 'is_filled_cavity',
+                    'is_solid_brick', 'is_system_built', 'is_timber_frame',
+                    'is_granite_or_whinstone', 'is_as_built', 'is_cob', 'is_assumed',
+                    'is_sandstone_or_limestone', 'insulation_thickness',
+                    'external_insulation', 'internal_insulation',
+                ],
+                "floor": [
+                    'original_description', 'thermal_transmittance',
+                    'thermal_transmittance_unit', 'is_assumed', 'is_to_unheated_space',
+                    'is_to_external_air', 'is_suspended', 'is_solid',
+                    'another_property_below', 'insulation_thickness', 'no_data',
+                ],
+                "roof": [
+                    'original_description', 'thermal_transmittance',
+                    'thermal_transmittance_unit', 'is_pitched', 'is_roof_room', 'is_loft',
+                    'is_flat', 'is_thatched', 'is_at_rafters', 'is_assumed',
+                    'has_dwelling_above', 'is_valid', 'insulation_thickness',
+                ],
+            }
+
+            # Flags that describe what a component IS (as opposed to how well it is
+            # insulated); these must match between the two EPCs.
+            construction_flags = {
+                "walls": [
+                    "is_cavity_wall", "is_solid_brick", "is_timber_frame",
+                    "is_granite_or_whinstone", "is_cob", "is_sandstone_or_limestone",
+                ],
+                "floor": [
+                    "is_suspended", "is_solid", "another_property_below", "is_to_unheated_space",
+                ],
+                "roof": [
+                    "is_pitched", "is_roof_room", "is_loft", "is_flat", "is_thatched",
+                    "is_at_rafters", "has_dwelling_above",
+                ],
+            }
+
+            for component in ["walls", "floor", "roof"]:
+                starting_col = f"{component.upper()}_DESCRIPTION_STARTING"
+                ending_col = f"{component.upper()}_DESCRIPTION_ENDING"
+
+                # Build the lookup frame once and reuse it for both merges
+                lookup = pd.DataFrame(cleaned_lookup[f"{component}-description"])
+
+                df = df.merge(
+                    lookup,
+                    how="left",
+                    left_on=starting_col,
+                    right_on="original_description",
+                ).merge(
+                    lookup,
+                    how="left",
+                    left_on=ending_col,
+                    right_on="original_description",
+                    suffixes=("", "_ENDING"),
+                )
+
+                # We make sure the construction of the component hasn't changed
+                unchanged = pd.Series(True, index=df.index, dtype=bool)
+                for flag in construction_flags[component]:
+                    unchanged &= df[flag] == df[f"{flag}_ENDING"]
+                df = df[unchanged]
+
+                # Drop the lookup columns (both copies) and the original descriptions, then
+                # put the cleaned descriptions in their place
+                lookup_cols = lookup_cols_to_drop[component]
+                ending_lookup_cols = [f"{col}_ENDING" for col in lookup_cols]
+                df = df.drop(
+                    columns=lookup_cols + ending_lookup_cols + [starting_col, ending_col]
+                ).rename(
+                    columns={
+                        "clean_description": starting_col,
+                        "clean_description_ENDING": ending_col,
+                    }
+                )
+
+            return df
+
+        data_by_urpn_df = process_and_prune_desriptions(data_by_urpn_df, cleaned_lookup)
+
         dataset.append(data_by_urpn_df)
 
         cleaning_averages["LOCAL_AUTHORITY"] = df["LOCAL_AUTHORITY"].values[0]