diff --git a/etl/epc/DataProcessor.py b/etl/epc/DataProcessor.py
index a77bcaa3..2494497d 100644
--- a/etl/epc/DataProcessor.py
+++ b/etl/epc/DataProcessor.py
@@ -56,8 +56,11 @@ construction_age_remap = {
 
 expanded_map = {
     i: [
-        label for label, bounds in construction_age_bounds_map.items() if (i <= bounds["u"]) and (i >= bounds['l'])
-    ][0] for i in range(0, 3001)
+        label
+        for label, bounds in construction_age_bounds_map.items()
+        if (i <= bounds["u"]) and (i >= bounds["l"])
+    ][0]
+    for i in range(0, 3001)
 }
 
 
@@ -74,8 +77,13 @@ class EPCDataProcessor:
     Handle data loading and data preprocessing
     """
 
-    def __init__(self, data: pd.DataFrame | None = None, cleaning_averages: pd.DataFrame | None = None,
-                 run_mode: str = "training", violation_mode: bool = False) -> None:
+    def __init__(
+        self,
+        data: pd.DataFrame | None = None,
+        cleaning_averages: pd.DataFrame | None = None,
+        run_mode: str = "training",
+        violation_mode: bool = False,
+    ) -> None:
         """
         :param filepath: If specified, is the physical location of the data
         :param is_newdata: Indicates if we are processing new, testing data.
@@ -86,7 +94,9 @@ class EPCDataProcessor:
         self.data: pd.DataFrame = data if is_data_a_dataframe else pd.DataFrame()
 
         is_cleaning_averages_a_dataframe = isinstance(cleaning_averages, pd.DataFrame)
-        self.cleaning_averages: pd.DataFrame = cleaning_averages if is_cleaning_averages_a_dataframe else pd.DataFrame()
+        self.cleaning_averages: pd.DataFrame = (
+            cleaning_averages if is_cleaning_averages_a_dataframe else pd.DataFrame()
+        )
 
         # FOR NOW IF VIOLATION MODE IS ON, WE USE RUN MODE AS NEWDATA
         self.violation_mode = violation_mode
@@ -103,7 +113,9 @@ class EPCDataProcessor:
         ignore_step = True if self.run_mode == "newdata" else False
 
         if filepath is not None:
-            self.load_data(filepath=filepath, low_memory=DATA_PROCESSOR_SETTINGS["low_memory"])
+            self.load_data(
+                filepath=filepath, low_memory=DATA_PROCESSOR_SETTINGS["low_memory"]
+            )
 
         if len(self.data) == 0:
             raise Exception("No data to process - check filepath/ data being passed in")
@@ -114,24 +126,53 @@ class EPCDataProcessor:
         self.remap_build_form()
         self.cast_data_column_values_to_lower()
         self.standardise_construction_age_band(ignore_step=ignore_step)
-        self.clean_missing_rooms(ignore_step=ignore_step)
+
+        # TEST: Let's not impute any missing rooms - drop rows with missing room counts instead
+        self.data = self.data[~self.data["NUMBER_HEATED_ROOMS"].isnull()]
+        self.data = self.data[~self.data["NUMBER_HABITABLE_ROOMS"].isnull()]
+        # self.clean_missing_rooms(ignore_step=ignore_step)
+
         self.recast_df_columns(
             column_mappings=DATA_PROCESSOR_SETTINGS["column_mappings"]
         )
-        self.clean_multi_glaze_proportion(ignore_step=ignore_step)
+
+        # TEST: Drop the cleaning of multi glaze proportion
+        self.data = self.data[~self.data["MULTI_GLAZE_PROPORTION"].isnull()]
+        # self.clean_multi_glaze_proportion(ignore_step=ignore_step)
+
+        # TEST: keep the cleaning of photo supply - dropping null rows instead loses a lot of data
+        # self.data = self.data[~self.data["PHOTO_SUPPLY"].isnull()]
         self.clean_photo_supply()
+
+        # TEST: For the na_remapping, we can remove all nas before the retain_multiple_epc_properties step
+        for col in fill_na_map.keys():
+            self.data = self.data[~self.data[col].isnull()]
+
+        # TEST: Need to remove floor height, total floor area and FIXED_LIGHTING_OUTLETS_COUNT from the fill_na_fields
+        self.data = self.data[~self.data["FLOOR_HEIGHT"].isnull()]
+        self.data = self.data[~self.data["TOTAL_FLOOR_AREA"].isnull()]
+        self.data = self.data[~self.data["FIXED_LIGHTING_OUTLETS_COUNT"].isnull()]
+
         self.retain_multiple_epc_properties(
-            epc_minimum_count=DATA_PROCESSOR_SETTINGS["epc_minimum_count"], ignore_step=ignore_step
+            epc_minimum_count=DATA_PROCESSOR_SETTINGS["epc_minimum_count"],
+            ignore_step=ignore_step,
         )
 
-        self.fill_na_fields()
+        # TEST: Should be no need to fill na fields
+        # self.fill_na_fields()
 
         self.sort_data_by_uprn_lodgement_date(ignore_step=ignore_step)
 
         # Final re-casting after data transformed and prepared
         self.recast_df_columns(column_mappings=COLUMNTYPES, auto_subset_columns=True)
         self.recast_all_data(column_mappings=COLUMNTYPES, auto_subset_columns=True)
-        self.na_remapping(auto_subset_columns=True)
+
+        # TEST: Remove this step
+        # self.na_remapping(auto_subset_columns=True)
+
+        if len(self.data) == 0:
+            self.cast_data_columns_to_lower()
+            return
 
         self.fill_invalid_constituency_fields(ignore_step=ignore_step)
 
@@ -151,11 +192,13 @@ class EPCDataProcessor:
         if self.run_mode == "newdata":
             cleaning_averages.columns = cleaning_averages.columns.str.upper()
 
-        cleaned_data = self.apply_averages_cleaning(
-            data_to_clean=self.data,
-            cleaning_data=cleaning_averages,
-            cols_to_merge_on=COLUMNS_TO_MERGE_ON,
-        )
+        # TEST: Remove this step
+        cleaned_data = None
+        # cleaned_data = self.apply_averages_cleaning(
+        #     data_to_clean=self.data,
+        #     cleaning_data=cleaning_averages,
+        #     cols_to_merge_on=COLUMNS_TO_MERGE_ON,
+        # )
 
         self.data = self.data if cleaned_data is None else cleaned_data
 
@@ -188,7 +231,9 @@ class EPCDataProcessor:
         if ignore_step:
             return
 
-        self.cleaning_averages["LOCAL_AUTHORITY"] = self.data["LOCAL_AUTHORITY"].values[0]
+        self.cleaning_averages["LOCAL_AUTHORITY"] = self.data["LOCAL_AUTHORITY"].values[
+            0
+        ]
 
     def fill_invalid_constituency_fields(self, ignore_step: bool = False):
         """
@@ -201,7 +246,9 @@ class EPCDataProcessor:
         if ignore_step:
             return
 
-        self.data = self.data.fillna({"CONSTITUENCY": self.data["CONSTITUENCY"].mode().values[0]})
+        self.data = self.data.fillna(
+            {"CONSTITUENCY": self.data["CONSTITUENCY"].mode().values[0]}
+        )
 
     def sort_data_by_uprn_lodgement_date(self, ignore_step: bool = False):
         """
@@ -301,7 +348,7 @@ class EPCDataProcessor:
         """
 
         if self.violation_mode:
-            # TODO: to fill in 
+            # TODO: to fill in
             return
 
         if ignore_step:
@@ -311,9 +358,7 @@ class EPCDataProcessor:
             lambda x: self.clean_construction_age_band(x)
         )
 
-        self.data = self.data[
-            ~pd.isnull(self.data["CONSTRUCTION_AGE_BAND"])
-        ]
+        self.data = self.data[~pd.isnull(self.data["CONSTRUCTION_AGE_BAND"])]
 
     def clean_missing_rooms(self, ignore_step: bool = False):
         """
@@ -331,31 +376,45 @@ class EPCDataProcessor:
             return
 
         # TODO: DO we want to move this out of this function? (i.e. alter the data before we do any cleaning)
-        self.data["POSTAL_AREA"] = self.data["POSTCODE"].apply(lambda x: x.split(" ")[0])
+        self.data["POSTAL_AREA"] = self.data["POSTCODE"].apply(
+            lambda x: x.split(" ")[0]
+        )
 
         def apply_clean(data, matching_columns):
 
-            cleaning_data = data[~pd.isnull(data[col])].groupby(
-                matching_columns
-            )[col].median().reset_index()
-
-            data = data.merge(
-                cleaning_data, how="left", on=matching_columns, suffixes=("", "_CLEANING")
+            cleaning_data = (
+                data[~pd.isnull(data[col])]
+                .groupby(matching_columns)[col]
+                .median()
+                .reset_index()
             )
 
-            data[col] = np.where(pd.isnull(data[col]), data[f"{col}_CLEANING"], data[col])
+            data = data.merge(
+                cleaning_data,
+                how="left",
+                on=matching_columns,
+                suffixes=("", "_CLEANING"),
+            )
+
+            data[col] = np.where(
+                pd.isnull(data[col]), data[f"{col}_CLEANING"], data[col]
+            )
             data = data.drop(columns=f"{col}_CLEANING")
             return data
 
         for col in ["NUMBER_HEATED_ROOMS", "NUMBER_HABITABLE_ROOMS"]:
 
             to_index = 3
-            matching_columns = ["PROPERTY_TYPE", "BUILT_FORM", "CONSTRUCTION_AGE_BAND", "POSTAL_AREA"]
+            matching_columns = [
+                "PROPERTY_TYPE",
+                "BUILT_FORM",
+                "CONSTRUCTION_AGE_BAND",
+                "POSTAL_AREA",
+            ]
             has_missings = pd.isnull(self.data[col]).sum()
             while has_missings:
                 self.data = apply_clean(
-                    data=self.data,
-                    matching_columns=matching_columns[0:to_index + 1]
+                    data=self.data, matching_columns=matching_columns[0 : to_index + 1]
                 )
                 has_missings = pd.isnull(self.data[col]).sum()
 
@@ -363,7 +422,10 @@ class EPCDataProcessor:
                     # Check if we've gotten to index 0 and still have missings - something has gone wrong or
                     # we have a very unique property type
                     if has_missings:
-                        raise NotImplementedError("Handle this edge case, we still have missings for column %s" % col)
+                        raise NotImplementedError(
+                            "Handle this edge case, we still have missings for column %s"
+                            % col
+                        )
 
                     break
                 to_index -= 1
@@ -410,7 +472,7 @@ class EPCDataProcessor:
     #     coltypes = {k: v for k, v in COLUMNTYPES.items() if k in self.data.columns} if self.is_newdata else
     #     COLUMNTYPES
     #     for k, v in coltypes.items():
-    #         self.data[k] = self.data[k].astype(v) 
+    #         self.data[k] = self.data[k].astype(v)
     #     self.data = self.data.astype(coltypes)
 
     #     self.na_remapping()
@@ -437,9 +499,11 @@ class EPCDataProcessor:
 
     def na_remapping(self, auto_subset_columns: bool = False):
 
-        fill_na_map_apply = {
-            k: v for k, v in fill_na_map.items() if k in self.data.columns
-        } if auto_subset_columns else fill_na_map
+        fill_na_map_apply = (
+            {k: v for k, v in fill_na_map.items() if k in self.data.columns}
+            if auto_subset_columns
+            else fill_na_map
+        )
 
         for column, fill_value in fill_na_map_apply.items():
             self.data[column] = self.data[column].fillna(fill_value)
@@ -535,28 +599,34 @@ class EPCDataProcessor:
 
         for variable in AVERAGE_FIXED_FEATURES:
             # Replace any missing NAN values with averages for the same Property type and built form
-            cleaning_averages_filled[variable] = cleaning_averages_filled[variable].fillna(
-                cleaning_averages_filled[f"{variable}_AVERAGE"]
-            )
+            cleaning_averages_filled[variable] = cleaning_averages_filled[
+                variable
+            ].fillna(cleaning_averages_filled[f"{variable}_AVERAGE"])
 
-            cleaning_averages_filled = cleaning_averages_filled.drop(columns=f"{variable}_AVERAGE")
+            cleaning_averages_filled = cleaning_averages_filled.drop(
+                columns=f"{variable}_AVERAGE"
+            )
 
             #  If there are still NA values i.e. the averages do not have values for a speicifc group of property tyope
             #  and built form
             #  We can use just the property type average and replace
 
-            cleaning_averages_filled[variable] = cleaning_averages_filled[variable].fillna(
-                cleaning_averages_filled[f"{variable}_PROPERTY_AVERAGE"]
-            )
+            cleaning_averages_filled[variable] = cleaning_averages_filled[
+                variable
+            ].fillna(cleaning_averages_filled[f"{variable}_PROPERTY_AVERAGE"])
 
-            cleaning_averages_filled = cleaning_averages_filled.drop(columns=f"{variable}_PROPERTY_AVERAGE")
+            cleaning_averages_filled = cleaning_averages_filled.drop(
+                columns=f"{variable}_PROPERTY_AVERAGE"
+            )
 
             # If there are still NA values, use BUILT FORM averages
-            cleaning_averages_filled["variable"] = cleaning_averages_filled[variable].fillna(
-                cleaning_averages_filled[f"{variable}_BUILT_FORM_AVERAGE"]
-            )
+            cleaning_averages_filled[variable] = cleaning_averages_filled[
+                variable
+            ].fillna(cleaning_averages_filled[f"{variable}_BUILT_FORM_AVERAGE"])
 
-            cleaning_averages_filled = cleaning_averages_filled.drop(columns=f"{variable}_BUILT_FORM_AVERAGE")
+            cleaning_averages_filled = cleaning_averages_filled.drop(
+                columns=f"{variable}_BUILT_FORM_AVERAGE"
+            )
 
             # If there still is na values, use average across all epc in consituecy
             cleaning_averages_filled[variable] = cleaning_averages_filled[
@@ -573,7 +643,9 @@ class EPCDataProcessor:
 
         self.cleaning_averages = cleaning_averages_filled
 
-    def retain_multiple_epc_properties(self, epc_minimum_count: int = 1, ignore_step: bool = False) -> None:
+    def retain_multiple_epc_properties(
+        self, epc_minimum_count: int = 1, ignore_step: bool = False
+    ) -> None:
         """
         Reduce the data futher by keeping only datasets with multiple epcs
         """
@@ -592,12 +664,16 @@ class EPCDataProcessor:
         counts = counts[counts["count"] > epc_minimum_count]
         self.data = pd.merge(self.data, counts, on="UPRN")
 
-    def recast_df_columns(self, column_mappings: dict, auto_subset_columns: bool = False) -> None:
+    def recast_df_columns(
+        self, column_mappings: dict, auto_subset_columns: bool = False
+    ) -> None:
         """
         Recast columns from the dataframe to ensure the behaviour we want
         """
         if auto_subset_columns:
-            column_mappings = {k: v for k, v in column_mappings.items() if k in self.data.columns}
+            column_mappings = {
+                k: v for k, v in column_mappings.items() if k in self.data.columns
+            }
 
         for key, values in column_mappings.items():
             if key not in self.data.columns:
@@ -608,13 +684,17 @@ class EPCDataProcessor:
             else:
                 self.data[key] = self.data[key].astype(values)
 
-    def recast_all_data(self, column_mappings: dict, auto_subset_columns: bool = False) -> None:
+    def recast_all_data(
+        self, column_mappings: dict, auto_subset_columns: bool = False
+    ) -> None:
         """
         Using a dictionary to recast all columns at once
         """
 
         if auto_subset_columns:
-            column_mappings = {k: v for k, v in column_mappings.items() if k in self.data.columns}
+            column_mappings = {
+                k: v for k, v in column_mappings.items() if k in self.data.columns
+            }
 
         self.data = self.data.astype(column_mappings)
 
@@ -625,14 +705,28 @@ class EPCDataProcessor:
 
         if self.violation_mode:
             violation_uprn_missing = pd.isnull(self.data["UPRN"])
-            violation_old_lodgment_date = self.data["LODGEMENT_DATE"] < EARLIEST_EPC_DATE
-            violation_invalid_transaction_type = self.data["TRANSACTION_TYPE"] == IGNORED_TRANSACTION_TYPES
-            violation_ignored_floor_level = self.data["FLOOR_LEVEL"].isin(IGNORED_FLOOR_LEVELS)
+            violation_old_lodgment_date = (
+                self.data["LODGEMENT_DATE"] < EARLIEST_EPC_DATE
+            )
+            violation_invalid_transaction_type = (
+                self.data["TRANSACTION_TYPE"] == IGNORED_TRANSACTION_TYPES
+            )
+            violation_ignored_floor_level = self.data["FLOOR_LEVEL"].isin(
+                IGNORED_FLOOR_LEVELS
+            )
             violation_rdsap_score_above_max = self.data[RDSAP_RESPONSE] > MAX_SAP_SCORE
-            violation_missing_windows_description = pd.isnull(self.data["WINDOWS_DESCRIPTION"])
-            violation_missing_hotwater_description = pd.isnull(self.data["HOTWATER_DESCRIPTION"])
-            violation_missing_roof_description = pd.isnull(self.data["ROOF_DESCRIPTION"])
-            violation_invalid_property_type = self.data["PROPERTY_TYPE"] == IGNORED_PROPERTY_TYPES
+            violation_missing_windows_description = pd.isnull(
+                self.data["WINDOWS_DESCRIPTION"]
+            )
+            violation_missing_hotwater_description = pd.isnull(
+                self.data["HOTWATER_DESCRIPTION"]
+            )
+            violation_missing_roof_description = pd.isnull(
+                self.data["ROOF_DESCRIPTION"]
+            )
+            violation_invalid_property_type = (
+                self.data["PROPERTY_TYPE"] == IGNORED_PROPERTY_TYPES
+            )
             violation_invalid_tenure = self.data["TENURE"].isin(IGNORED_TENURES)
 
             violation_df = pd.concat(
@@ -647,7 +741,8 @@ class EPCDataProcessor:
                     violation_missing_roof_description,
                     violation_invalid_property_type,
                     violation_invalid_tenure,
-                ], axis=1,
+                ],
+                axis=1,
                 keys=[
                     "violation_uprn_missing",
                     "violation_old_lodgment_date",
@@ -658,8 +753,8 @@ class EPCDataProcessor:
                     "violation_missing_hotwater_description",
                     "violation_missing_roof_description",
                     "violation_invalid_property_type",
-                    "violation_invalid_tenure"
-                ]
+                    "violation_invalid_tenure",
+                ],
             )
 
             self.data = pd.concat([self.data, violation_df], axis=1)
@@ -685,10 +780,10 @@ class EPCDataProcessor:
 
         self.data = self.data[~pd.isnull(self.data["UPRN"])]
         self.data = self.data[self.data["LODGEMENT_DATE"] >= EARLIEST_EPC_DATE]
-        self.data = self.data[self.data["TRANSACTION_TYPE"] != IGNORED_TRANSACTION_TYPES]
         self.data = self.data[
-            ~self.data["FLOOR_LEVEL"].isin(IGNORED_FLOOR_LEVELS)
+            self.data["TRANSACTION_TYPE"] != IGNORED_TRANSACTION_TYPES
         ]
+        self.data = self.data[~self.data["FLOOR_LEVEL"].isin(IGNORED_FLOOR_LEVELS)]
         self.data = self.data[self.data[RDSAP_RESPONSE] <= MAX_SAP_SCORE]
 
         # We observed 7 final records with missing windows and 2 records with missing hot water so we shall remove them
@@ -705,7 +800,7 @@ class EPCDataProcessor:
         self.data = self.data[~self.data["TENURE"].isin(IGNORED_TENURES)]
 
         # We remap zero values to None
-        self.data.loc[self.data['FLOOR_HEIGHT'] == 0, 'FLOOR_HEIGHT'] = None
+        self.data.loc[self.data["FLOOR_HEIGHT"] == 0, "FLOOR_HEIGHT"] = None
 
     def clean_multi_glaze_proportion(self, ignore_step: bool = False) -> None:
         """
@@ -734,7 +829,11 @@ class EPCDataProcessor:
 
     @staticmethod
     def apply_averages_cleaning(
-        data_to_clean, cleaning_data, cols_to_merge_on, colnames=None, ignore_step: bool = False
+        data_to_clean,
+        cleaning_data,
+        cols_to_merge_on,
+        colnames=None,
+        ignore_step: bool = False,
     ):
         """
         Clean the input DataFrame using averages from a cleaning DataFrame.
@@ -752,12 +851,13 @@ class EPCDataProcessor:
 
         # The desired colnames to clean - which may not be present
         if colnames is None:
-            colnames = ["TOTAL_FLOOR_AREA", "FLOOR_HEIGHT", "FIXED_LIGHTING_OUTLETS_COUNT"]
+            colnames = [
+                "TOTAL_FLOOR_AREA",
+                "FLOOR_HEIGHT",
+                "FIXED_LIGHTING_OUTLETS_COUNT",
+            ]
 
-        cols_to_clean = [
-            c for c in colnames if
-            c in data_to_clean.columns
-        ]
+        cols_to_clean = [c for c in colnames if c in data_to_clean.columns]
 
         # Enforce data types
         for col in ["NUMBER_HABITABLE_ROOMS", "NUMBER_HEATED_ROOMS"]:
@@ -768,7 +868,15 @@ class EPCDataProcessor:
 
         # Calculate averages
         cleaning_averages_to_merge = cleaning_data.groupby(columns_to_merge_on).agg(
-            dict(zip(cols_to_clean, ["mean", ] * len(cols_to_clean)))
+            dict(
+                zip(
+                    cols_to_clean,
+                    [
+                        "mean",
+                    ]
+                    * len(cols_to_clean),
+                )
+            )
         )
 
         # Merge with the original data
@@ -777,7 +885,7 @@ class EPCDataProcessor:
             cleaning_averages_to_merge,
             on=columns_to_merge_on,
             suffixes=("", "_AVERAGE"),
-            how='left'
+            how="left",
         )
 
         global_averages = cleaning_data[cols_to_clean].mean()
@@ -806,14 +914,20 @@ class EPCDataProcessor:
             raise Exception("Suffix should be one of _starting or _ending")
 
         if suffix == "_STARTING":
-            starting_cols = self.data[STARTING_SUFFIX_COMPONENT_COLS + EFFICIENCY_FEATURES].copy().add_suffix(suffix)
+            starting_cols = (
+                self.data[STARTING_SUFFIX_COMPONENT_COLS + EFFICIENCY_FEATURES]
+                .copy()
+                .add_suffix(suffix)
+            )
             fixed_cols = self.data[NO_SUFFIX_COMPONENT_COLS + POTENTIAL_COLUMNS].copy()
 
             return pd.concat([starting_cols, fixed_cols], axis=1)
 
-        return self.data[
-            ENDING_SUFFIX_COMPONENT_COLS + EFFICIENCY_FEATURES
-            ].copy().add_suffix(suffix)
+        return (
+            self.data[ENDING_SUFFIX_COMPONENT_COLS + EFFICIENCY_FEATURES]
+            .copy()
+            .add_suffix(suffix)
+        )
 
     def get_fixed_features(self) -> pd.DataFrame:
         """
@@ -831,14 +945,17 @@ class EPCDataProcessor:
         :param cols_to_ignore: If specified, is a list of columns to ignore, e.g. uuids
         :return: DataFrame with coerced columns.
         """
-        object_columns = df.select_dtypes(include=['object']).columns
+        object_columns = df.select_dtypes(include=["object"]).columns
         if cols_to_ignore:
             object_columns = [c for c in object_columns if c not in cols_to_ignore]
 
         for column in object_columns:
             unique_values = df[column].dropna().unique()
             # If the unique values in the column are 'True' and 'False', convert the column to boolean
-            if set(unique_values) == {'True', 'False'} or set(unique_values) == {True, False}:
+            if set(unique_values) == {"True", "False"} or set(unique_values) == {
+                True,
+                False,
+            }:
                 df[column] = df[column].astype(bool)
 
         return df
@@ -877,7 +994,6 @@ class EPCDataProcessor:
 
     @staticmethod
     def clean_efficiency_variables(df):
-
         """
         These is scope to clean this by the model per corresponding description.
         E.g. for WALLS_ENG_EFF we could look at the mode efficiency rating by description and
diff --git a/etl/epc/Pipeline.py b/etl/epc/Pipeline.py
index 6abf05bd..665d9320 100644
--- a/etl/epc/Pipeline.py
+++ b/etl/epc/Pipeline.py
@@ -83,9 +83,9 @@ class EPCPipeline:
         run_mode="training",
         epc_local_file="certificates.csv",
         epc_bucket_name="retrofit-data-dev",
-        epc_cleaning_dataset_key="sap_change_model/{}/cleaning_dataset_rooms.parquet",
-        epc_all_equal_rows_key="sap_change_model/{}/all_equal_rows_rooms.parquet",
-        epc_compiled_dataset_key="sap_change_model/{}/dataset_rooms.parquet",
+        epc_cleaning_dataset_key="sap_change_model/{}/cleaning_dataset_no_cleaning.parquet",
+        epc_all_equal_rows_key="sap_change_model/{}/all_equal_rows_no_cleaning.parquet",
+        epc_compiled_dataset_key="sap_change_model/{}/dataset_no_cleaning.parquet",
         use_parallel=False,
     ):
         """
@@ -237,6 +237,9 @@ class EPCPipeline:
             if difference_records is not None:
                 constituency_difference_records.extend(difference_records)
 
+        if len(constituency_difference_records) == 0:
+            return
+
         constituency_dataset = TrainingDataset(
             datasets=constituency_difference_records, cleaned_lookup=clean_lookup
         )
diff --git a/etl/epc/generate_scenarios_data.py b/etl/epc/generate_scenarios_data.py
index d5bece8b..986410c8 100644
--- a/etl/epc/generate_scenarios_data.py
+++ b/etl/epc/generate_scenarios_data.py
@@ -54,8 +54,18 @@ scenario_properties = [
         "postcode": "NN1 5JY",
         "lmk-key": "1459796789102016070507274146560098",
         "measures": [
-            [["internal_wall_insulation"], "11", None, [0]],
-            [["external_wall_insulation"], "10", None, [0]],
+            [
+                ["internal_wall_insulation"],
+                "11",
+                {"walls_insulation_thickness_ending": "average"},
+                [0],
+            ],
+            [
+                ["external_wall_insulation"],
+                "10",
+                {"walls_insulation_thickness_ending": "average"},
+                [0],
+            ],
             [["solar", "windows"], "12-15", {"photo_supply_ending": 50}, [0, 1]],
         ],
     },
@@ -64,7 +74,12 @@ scenario_properties = [
         "postcode": "HP1 2HA",
         "lmk-key": "c14029235739827d5f627dc8aa9bb567d026b267e851e0db0001db24638667b1",
         "measures": [
-            [["cavity_wall_insulation", "loft_insulation"], "15", None, [0, 1]],
+            [
+                ["cavity_wall_insulation", "loft_insulation"],
+                "15",
+                {"walls_insulation_thickness_ending": "average"},
+                [0, 1],
+            ],
         ],
     },
     {
@@ -72,7 +87,12 @@ scenario_properties = [
         "postcode": "HP1 2HE",
         "lmk-key": "99296a6dda21314fef3a61cda59e441e9a2aacf115eb96f4a0fa85696bf7b117",
         "measures": [
-            [["cavity_wall_insulation", "loft_insulation"], "15", None, [0, 1]],
+            [
+                ["cavity_wall_insulation", "loft_insulation"],
+                "15",
+                {"walls_insulation_thickness_ending": "average"},
+                [0, 1],
+            ],
         ],
     },
     {
@@ -80,7 +100,12 @@ scenario_properties = [
         "postcode": "HP1 2AN",
         "lmk-key": "d1e0534be3a44c33003323b21d0e322e3daddc65b5ee71936f89c59ddab96b50",
         "measures": [
-            [["cavity_wall_insulation", "loft_insulation"], "15", None, [0, 1]],
+            [
+                ["cavity_wall_insulation", "loft_insulation"],
+                "15",
+                {"walls_insulation_thickness_ending": "average"},
+                [0, 1],
+            ],
         ],
     },
     {
@@ -88,7 +113,12 @@ scenario_properties = [
         "postcode": "HP1 2HX",
         "lmk-key": "1eae354db522a95188018d9cd0502ed8c609910b6c88f8797d3a25f59b11770a",
         "measures": [
-            [["cavity_wall_insulation", "loft_insulation"], "15", None, [0, 1]],
+            [
+                ["cavity_wall_insulation", "loft_insulation"],
+                "15",
+                {"walls_insulation_thickness_ending": "average"},
+                [0, 1],
+            ],
         ],
     },
 ]
diff --git a/etl/epc/property_change_app.py b/etl/epc/property_change_app.py
index c985567d..506c32b3 100644
--- a/etl/epc/property_change_app.py
+++ b/etl/epc/property_change_app.py
@@ -12,7 +12,7 @@ def main():
     """
 
     directories = [entry for entry in DATA_DIRECTORY.iterdir() if entry.is_dir()]
-    # directories = directories[0:3]
+    # directories = directories[76:85]
 
     epc_pipeline = EPCPipeline(
         directories=directories,