diff --git a/model_data/simulation_system/DataProcessor.py b/model_data/simulation_system/DataProcessor.py
index 2aa0fabe..477883c4 100644
--- a/model_data/simulation_system/DataProcessor.py
+++ b/model_data/simulation_system/DataProcessor.py
@@ -1,13 +1,19 @@
 from pathlib import Path
+import numpy as np
 import pandas as pd
+from model_data.BaseUtility import BaseUtility
 from simulation_system.Settings import (
     DATA_PROCESSOR_SETTINGS,
     EARLIEST_EPC_DATE,
     FULLY_GLAZED_DESCRIPTIONS,
     AVERAGE_FIXED_FEATURES,
     FLOOR_HEIGHT_NATIONAL_AVERAGE,
-    TOTAL_FLOOR_AREA_NATIONAL_AVERAGE
+    TOTAL_FLOOR_AREA_NATIONAL_AVERAGE,
+    FLOOR_LEVEL_MAP,
+    BUILT_FORM_REMAP,
+    COLUMNS_TO_MERGE_ON
     )
+from typing import List
 
 
 class DataProcessor:
@@ -32,11 +38,48 @@ class DataProcessor:
         self.recast_df_columns(column_mappings=DATA_PROCESSOR_SETTINGS['column_mappings'])
         self.clean_multi_glaze_proportion()
         self.retain_multiple_epc_properties(epc_minimum_count=DATA_PROCESSOR_SETTINGS['epc_minimum_count'])
+        self.remap_columns()
+
+        if DATA_PROCESSOR_SETTINGS['epc_minimum_count'] >= 1:
+            # If we have multiple EPC records, we can try and do filling
+            self.fill_na_fields()
         
         self.data = self.data.sort_values(["UPRN", "LODGEMENT_DATE"], ascending=True)
 
         return self.data
     
+    def fill_na_fields(self, columns_to_fill: List = COLUMNS_TO_MERGE_ON):
+        """
+        Fill missing values in ``columns_to_fill`` from the other EPC rows of the same UPRN.
+
+        Within each UPRN group values are back-filled, then forward-filled, following the
+        current row order of ``self.data``. Updates ``self.data`` in place; returns None.
+        """
+        uprn = self.data["UPRN"]
+        # Grouped bfill/ffill return frames on the original index, so no reset_index /
+        # 'level_1' juggling is needed, and the deprecated fillna(method=...) is avoided.
+        filled_data = self.data[columns_to_fill].groupby(uprn).bfill()
+        self.data[columns_to_fill] = filled_data.groupby(uprn).ffill()
+
+    def remap_columns(self):
+        """
+        Null out anomaly placeholders and recode categorical columns of ``self.data``.
+
+        Values listed in ``BaseUtility.DATA_ANOMALY_MATCHES`` and NaN become ``None``;
+        ``FLOOR_LEVEL`` and ``BUILT_FORM`` are recoded through their Settings maps.
+        """
+        # Map every anomaly value to None.
+        data_anomaly_map = dict.fromkeys(BaseUtility.DATA_ANOMALY_MATCHES)
+        data = self.data.replace(data_anomaly_map)
+        # Dict form forces a value replacement (a bare ``None`` value was ignored and
+        # pad-filled by pandas < 1.4); ``np.nan`` because NumPy 2.0 removed ``np.NAN``.
+        data = data.replace({np.nan: None})
+
+        data['FLOOR_LEVEL'] = data['FLOOR_LEVEL'].replace(FLOOR_LEVEL_MAP)
+        data['BUILT_FORM'] = data['BUILT_FORM'].replace(BUILT_FORM_REMAP)
+        data['BUILT_FROM'] = data['BUILT_FORM']  # misspelt legacy alias, kept for old readers
+        self.data = data
+
     def make_cleaning_averages(self) -> pd.DataFrame:
         # Define a custom function to calculate the median, excluding missing values
         def median_without_missing(group):
diff --git a/model_data/simulation_system/Settings.py b/model_data/simulation_system/Settings.py
index 04e11c25..1d302abf 100644
--- a/model_data/simulation_system/Settings.py
+++ b/model_data/simulation_system/Settings.py
@@ -4,6 +4,14 @@
 TOTAL_FLOOR_AREA_NATIONAL_AVERAGE = 70
 FLOOR_HEIGHT_NATIONAL_AVERAGE = 2.45
 
+COLUMNS_TO_MERGE_ON = [  # default fill columns of DataProcessor.fill_na_fields; merge keys in app.py
+    "PROPERTY_TYPE", 
+    "BUILT_FORM", 
+    "CONSTRUCTION_AGE_BAND", 
+    "NUMBER_HABITABLE_ROOMS",
+    "NUMBER_HEATED_ROOMS"
+    ]
+
 FULLY_GLAZED_DESCRIPTIONS = [
     "Fully double glazed",
     "High performance glazing",
@@ -111,4 +119,5 @@ DATA_PROCESSOR_SETTINGS = {
     'low_memory': False,
     'epc_minimum_count': 1,
     'column_mappings': {'UPRN': [int, str]}
-}
\ No newline at end of file
+}
+
diff --git a/model_data/simulation_system/app.py b/model_data/simulation_system/app.py
index 1037da14..517460b0 100644
--- a/model_data/simulation_system/app.py
+++ b/model_data/simulation_system/app.py
@@ -10,6 +10,7 @@ from model_data.simulation_system.Settings import (
     COMPONENT_FEATURES, 
     RDSAP_RESPONSE,
     HEAT_DEMAND_RESPONSE,
+    COLUMNS_TO_MERGE_ON,
     FLOOR_LEVEL_MAP,
     BUILT_FORM_REMAP
 )
@@ -27,7 +28,9 @@ def app():
     directories = [entry for entry in DATA_DIRECTORY.iterdir() if entry.is_dir()]
 
     dataset = []
-
+    # 116 
+    # 128048706
+    # PosixPath('/home/ubuntu/Documents/python/hestia/Model/model_data/simulation_system/data/all-domestic-certificates/domestic-E09000021-Kingston-upon-Thames')
     for directory in tqdm(directories): 
 
         filepath = directory / "certificates.csv"
@@ -46,42 +49,21 @@ def app():
             if max(property_data[MANDATORY_FIXED_FEATURES].nunique()) > 1:
                 continue
 
-            # Map all anomaly values to None
-            data_anomaly_map = dict(zip(BaseUtility.DATA_ANOMALY_MATCHES, [None]*len(BaseUtility.DATA_ANOMALY_MATCHES)))
-            
-            # Use replace function to map data (if exists in key), to corresponding value - i.e. Remove invalid values
-            modified_property_data = property_data.replace(data_anomaly_map)
-            modified_property_data = modified_property_data.replace(np.NAN, None)
-            
-            # Remap certain columns
-            modified_property_data['FLOOR_LEVEL'] = modified_property_data['FLOOR_LEVEL'].replace(FLOOR_LEVEL_MAP)
-            modified_property_data['BUILT_FROM'] = modified_property_data['BUILT_FORM'].replace(BUILT_FORM_REMAP)
-
             # Take the latest row for both the LATEST_FEILDS and MANDATORY FIELDS 
-            latest_field_data = modified_property_data[LATEST_FIELD].iloc[-1].to_dict()
-            mandatory_field_data = modified_property_data[MANDATORY_FIXED_FEATURES].iloc[-1].to_dict()
+            latest_field_data = property_data[LATEST_FIELD].iloc[-1].to_dict()
+            mandatory_field_data = property_data[MANDATORY_FIXED_FEATURES].iloc[-1].to_dict()
 
             # Taking just the last row, which is the percentage change from the latest to previous one only
-            # modified_property_data[AVERAGE_FIXED_FEATURES].fillna(value=0).pct_change().iloc[-1] > 0.1
+            # property_data[AVERAGE_FIXED_FEATURES].fillna(value=0).pct_change().iloc[-1] > 0.1
 
-            # We can replace any NA values for Average fixed features
-            # We have columns that we want to merge on, but some of these columns are all NA values
-            # So we determine which columns to merge on, and get the equivalent grouping in the averages
-            columns_to_merge_on = ["PROPERTY_TYPE", "BUILT_FORM", "CONSTRUCTION_AGE_BAND", "NUMBER_HABITABLE_ROOMS",
-                         "NUMBER_HEATED_ROOMS"]
-            
-            if modified_property_data[columns_to_merge_on].isna().values.any():
-                # If there are any NA value, back fill first (i.e most recent), then forward fill if needed
-                modified_property_data[columns_to_merge_on] = modified_property_data[columns_to_merge_on].fillna(method='bfill').fillna(method='ffill')
-            
             # Extract the columns that are not all None
-            na_columns = modified_property_data[columns_to_merge_on].isna().all()
-            columns_to_merge_on = na_columns.index[~na_columns].to_list()
+            na_columns = property_data[COLUMNS_TO_MERGE_ON].isna().all()
+            cleaned_columns_to_merge_on = na_columns.index[~na_columns].to_list()
 
             #  Get the corresponding groupby and merge, and fill in NA values
-            cleaning_averages_to_merge = cleaning_averages.groupby(columns_to_merge_on)[['TOTAL_FLOOR_AREA', 'FLOOR_HEIGHT']].mean()
+            cleaning_averages_to_merge = cleaning_averages.groupby(cleaned_columns_to_merge_on)[['TOTAL_FLOOR_AREA', 'FLOOR_HEIGHT']].mean()
             
-            modified_property_data = pd.merge(modified_property_data, cleaning_averages_to_merge, on=columns_to_merge_on, suffixes=['', '_AVERAGE'])
+            modified_property_data = pd.merge(property_data, cleaning_averages_to_merge, on=cleaned_columns_to_merge_on, suffixes=['', '_AVERAGE'])
             modified_property_data['TOTAL_FLOOR_AREA'] = modified_property_data['TOTAL_FLOOR_AREA'].fillna(modified_property_data['TOTAL_FLOOR_AREA_AVERAGE'])
             modified_property_data['FLOOR_HEIGHT'] = modified_property_data['FLOOR_HEIGHT'].fillna(modified_property_data['FLOOR_HEIGHT_AVERAGE'])
             modified_property_data = modified_property_data.drop(columns=['TOTAL_FLOOR_AREA_AVERAGE', 'FLOOR_HEIGHT_AVERAGE'])
@@ -95,8 +77,10 @@ def app():
                     if abs(vals[0] - vals[1]) / vals[0] > 0.1:
                         # Take the more recent value since it's likely to be more accurate
                         vals = [vals[-1]]
-         
 
+                if len(vals) == 0:
+                    wrong_var
+         
                 fixed_data[field] = np.mean(vals)
 
             #Combine all fields together
diff --git a/model_data/simulation_system/model_build_data/change_data/rdsap_full/test_data.parquet b/model_data/simulation_system/model_build_data/change_data/rdsap_full/test_data.parquet
new file mode 100644
index 00000000..ac5249ce
Binary files /dev/null and b/model_data/simulation_system/model_build_data/change_data/rdsap_full/test_data.parquet differ
diff --git a/model_data/simulation_system/model_build_data/change_data/rdsap_full/train_validation_data.parquet b/model_data/simulation_system/model_build_data/change_data/rdsap_full/train_validation_data.parquet
new file mode 100644
index 00000000..e7b2eb4a
Binary files /dev/null and b/model_data/simulation_system/model_build_data/change_data/rdsap_full/train_validation_data.parquet differ
diff --git a/model_data/simulation_system/training.py b/model_data/simulation_system/training.py
index cde310a3..da2c6f4a 100644
--- a/model_data/simulation_system/training.py
+++ b/model_data/simulation_system/training.py
@@ -99,18 +99,30 @@ def training(train_filepath: str, test_filepath: str) -> None:
     # logger.info('Split data into train and validation')
 
     logger.info('Build Model')
-    data = TabularDataset(data=train_df)
+    
+    data = TabularDataset(data=train_filepath)
+    data = data.drop(columns=['UPRN', 'HEAT_DEMAND_CHANGE'])
+    TOP_FEATURES = ['MAINHEAT', 'ROOF', 'WALLS', 'MAINHEATCONT', 'PHOTO', 'HOTWATER', 'SECONDHEAT']
+    # top_features = data.columns[data.columns.str.startswith(tuple(TOP_FEATURES))]
+
+    data = data[['RDSAP_CHANGE'] + top_features.to_list()]
+    # data = TabularDataset(data=train_df)
     # data['RDSAP_CHANGE'] = data['RDSAP_CHANGE'].astype(float)
-    subsample_size = round(len(data)/4)
+    subsample_size = round(len(data)/20)
     data = data.sample(subsample_size, random_state=RANDOM_SEED)
 
+    # Add custom metric class MAPE
+    # Have a look at temporal features
+
     target_column = 'RDSAP_CHANGE'
     predictor_RDSAP = TabularPredictor(
         label=target_column, 
         path="agModels-predictRDSAP", 
         problem_type="regression",
         eval_metric='mean_absolute_error'
-        ).fit(data, time_limit=8000, presets='high_quality', excluded_model_types=['KNN'])
+        ).fit(data, time_limit=200, presets='best_quality', excluded_model_types=['KNN'])
+
+
 
     logger.info('Evaluate matrics')