Merge pull request #126 from Hestia-Homes/michael-initial

Michael initial
2026-07-27 23:35:01 +00:00 · 2023-08-14 18:46:19 +01:00 · 2023-08-14 18:46:19 +01:00 · 7df05e4691
commit 7df05e4691
parent 773bd291a1 31832c7c26
8 changed files with 684 additions and 234 deletions
--- a/model_data/simulation_system/DataProcessor.py
+++ b/model_data/simulation_system/DataProcessor.py
@ -0,0 +1,142 @@
+from pathlib import Path
+import pandas as pd
+from simulation_system.Settings import (
+    DATA_PROCESSOR_SETTINGS,
+    EARLIEST_EPC_DATE,
+    FULLY_GLAZED_DESCRIPTIONS,
+    AVERAGE_FIXED_FEATURES,
+    FLOOR_HEIGHT_NATIONAL_AVERAGE,
+    TOTAL_FLOOR_AREA_NATIONAL_AVERAGE
+    )
+
+
+class DataProcessor:
+    """
+    Handle data loading and data preprocessing
+    """
+
+    def __init__(self, filepath: Path) -> None:
+        self.filepath = filepath
+
+    def load_data(self, low_memory=False) -> None:
+        self.data = pd.read_csv(self.filepath, low_memory=low_memory)
+
+    def pre_process(self) -> pd.DataFrame:
+        """
+        Load data and begin initial cleaning
+        """
+        self.load_data(low_memory=DATA_PROCESSOR_SETTINGS['low_memory'])
+        self.confine_data()
+        self.recast_df_columns(column_mappings=DATA_PROCESSOR_SETTINGS['column_mappings'])
+        self.clean_multi_glaze_proportion()
+        self.retain_multiple_epc_properties(epc_minimum_count=DATA_PROCESSOR_SETTINGS['epc_minimum_count'])
+        
+        self.data = self.data.sort_values(["UPRN", "LODGEMENT_DATE"], ascending=True)
+
+        return self.data
+    
+    def make_cleaning_averages(self) -> pd.DataFrame:
+        # Define a custom function to calculate the median, excluding missing values
+        def median_without_missing(group):
+            return group[AVERAGE_FIXED_FEATURES].median(skipna=True)
+
+        cleaning_averages = self.data.groupby(
+            ["PROPERTY_TYPE", "BUILT_FORM", "CONSTRUCTION_AGE_BAND", "NUMBER_HABITABLE_ROOMS", "NUMBER_HEATED_ROOMS"],
+            observed=True
+        ).apply(median_without_missing).reset_index()
+
+        general_averages = self.data.groupby(["PROPERTY_TYPE", "BUILT_FORM"], observed=True).apply(
+            median_without_missing).reset_index()
+        
+        property_averages = self.data.groupby(["PROPERTY_TYPE"], observed=True).apply(
+            median_without_missing).reset_index()
+        
+        built_form_averages = self.data.groupby(["BUILT_FORM"], observed=True).apply(
+            median_without_missing).reset_index()
+        
+        # We can clean up any NA's in the cleaning averages with the general averages here
+        cleaning_averages_filled = pd.merge(cleaning_averages, general_averages, on=['PROPERTY_TYPE', 'BUILT_FORM'], suffixes=['', '_AVERAGE'])
+        cleaning_averages_filled = pd.merge(cleaning_averages_filled, property_averages, on=['PROPERTY_TYPE'], suffixes=['', '_PROPERTY_AVERAGE'])
+        cleaning_averages_filled = pd.merge(cleaning_averages_filled, built_form_averages, on=['BUILT_FORM'], suffixes=['', '_BUILT_FORM_AVERAGE'])
+
+        # Replace any missing NAN values with averages for the same Property type and built form
+        cleaning_averages_filled['TOTAL_FLOOR_AREA'] = cleaning_averages_filled['TOTAL_FLOOR_AREA'].fillna(cleaning_averages_filled['TOTAL_FLOOR_AREA_AVERAGE'])
+        cleaning_averages_filled['FLOOR_HEIGHT'] = cleaning_averages_filled['FLOOR_HEIGHT'].fillna(cleaning_averages_filled['FLOOR_HEIGHT_AVERAGE'])
+        cleaning_averages_filled = cleaning_averages_filled.drop(columns=['TOTAL_FLOOR_AREA_AVERAGE', 'FLOOR_HEIGHT_AVERAGE'])
+
+        #  If there are still NA values i.e. the averages do not have values for a speicifc group of property tyope and built form
+        #  We can use just the property type average and replace
+        cleaning_averages_filled['TOTAL_FLOOR_AREA'] = cleaning_averages_filled['TOTAL_FLOOR_AREA'].fillna(cleaning_averages_filled['TOTAL_FLOOR_AREA_PROPERTY_AVERAGE'])
+        cleaning_averages_filled['FLOOR_HEIGHT'] = cleaning_averages_filled['FLOOR_HEIGHT'].fillna(cleaning_averages_filled['FLOOR_HEIGHT_PROPERTY_AVERAGE'])
+        cleaning_averages_filled = cleaning_averages_filled.drop(columns=['TOTAL_FLOOR_AREA_PROPERTY_AVERAGE', 'FLOOR_HEIGHT_PROPERTY_AVERAGE'])
+
+        # If there are still NA values, use BUILT FORM averages
+        cleaning_averages_filled['TOTAL_FLOOR_AREA'] = cleaning_averages_filled['TOTAL_FLOOR_AREA'].fillna(cleaning_averages_filled['TOTAL_FLOOR_AREA_BUILT_FORM_AVERAGE'])
+        cleaning_averages_filled['FLOOR_HEIGHT'] = cleaning_averages_filled['FLOOR_HEIGHT'].fillna(cleaning_averages_filled['FLOOR_HEIGHT_BUILT_FORM_AVERAGE'])
+        cleaning_averages_filled = cleaning_averages_filled.drop(columns=['TOTAL_FLOOR_AREA_BUILT_FORM_AVERAGE', 'FLOOR_HEIGHT_BUILT_FORM_AVERAGE'])
+
+        # If there still is na values, use average across all properties in consituecy
+        cleaning_averages_filled['TOTAL_FLOOR_AREA'] = cleaning_averages_filled['TOTAL_FLOOR_AREA'].fillna(cleaning_averages_filled['TOTAL_FLOOR_AREA'].mean())
+        cleaning_averages_filled['FLOOR_HEIGHT'] = cleaning_averages_filled['FLOOR_HEIGHT'].fillna(cleaning_averages_filled['FLOOR_HEIGHT'].mean())
+
+        # If the consituency is all NA values, then take UK AVERAGE VALUES
+        cleaning_averages_filled['TOTAL_FLOOR_AREA'] = cleaning_averages_filled['TOTAL_FLOOR_AREA'].fillna(TOTAL_FLOOR_AREA_NATIONAL_AVERAGE)
+        cleaning_averages_filled['FLOOR_HEIGHT'] = cleaning_averages_filled['FLOOR_HEIGHT'].fillna(FLOOR_HEIGHT_NATIONAL_AVERAGE)
+
+        return cleaning_averages_filled
+
+    def retain_multiple_epc_properties(self, epc_minimum_count: int = 1) -> None:
+        '''
+        Reduce the data futher by keeping only datasets with multiple epcs
+        '''
+
+        counts = self.data.groupby("UPRN").size().reset_index()
+        counts.columns = ["UPRN", "count"]
+
+        # take UPRNS with multiple EPCs
+        counts = counts[counts["count"] > epc_minimum_count]
+        self.data = pd.merge(self.data, counts, on='UPRN')
+
+    
+    def recast_df_columns(self, column_mappings: dict) -> None:
+        """
+        Recast columns from the dataframe to ensure the behaviour we want
+        """
+
+        for key, values in column_mappings.items():
+            if key not in self.data.columns:
+                print('Column mapping incorrectly specified')
+                exit(1)
+            for value in values:
+                self.data[key] = self.data[key].astype(value)
+
+
+    def confine_data(self) -> None:
+        """
+        Include all step to reduce down the data based on assumptions
+        """
+
+        # Filter 1: UPRN is a unique identifier for a property, so we remove any EPCs that don't have one
+
+        # Filter 2: Lodgement date is the date the EPC was lodged, so we remove any EPCs that were lodged
+        # before the introduction of SAP09
+
+        # Filter 3: We remove EPCS that were conducted for a new build, since these are performed with
+        # full SAP, which produces different results to the RdSAP methodology
+
+        # Filter 4: We remove floor level in top floor or mid floor since this is ambiguous
+
+        self.data = self.data[~pd.isnull(self.data["UPRN"])]
+        self.data = self.data[self.data["LODGEMENT_DATE"] >= EARLIEST_EPC_DATE]
+        self.data = self.data[self.data["TRANSACTION_TYPE"] != "new dwelling"]
+        self.data = self.data[~self.data["FLOOR_LEVEL"].isin(["top floor", "mid floor"])]
+
+    
+    def clean_multi_glaze_proportion(self) -> None:
+        """
+        If there is no multi-glaze proportion but the windows are fully glazed, then we should assume a score of 100
+        """
+
+        no_multi_glaze_proportion_index = pd.isnull(self.data["MULTI_GLAZE_PROPORTION"]) & (self.data["WINDOWS_DESCRIPTION"].isin(FULLY_GLAZED_DESCRIPTIONS))
+        self.data.loc[no_multi_glaze_proportion_index, 'MULTI_GLAZE_PROPORTION'] = 100
+
--- a/model_data/simulation_system/Logger.py
+++ b/model_data/simulation_system/Logger.py
@ -0,0 +1,22 @@
+import logging 
+
+def setup_logger():
+    # Create a logger
+    logger = logging.getLogger()
+    
+    # Set the log level
+    logger.setLevel(logging.INFO)
+    
+    # Create a formatter
+    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
+    
+    # Create a stream handler to direct logs to stdout
+    stream_handler = logging.StreamHandler()
+    stream_handler.setFormatter(formatter)
+    
+    # Add the stream handler to the logger
+    logger.addHandler(stream_handler)
+    
+    return logger
+
+logger = setup_logger()
--- a/model_data/simulation_system/Settings.py
+++ b/model_data/simulation_system/Settings.py
@ -0,0 +1,114 @@
+# Using a simply python file as settings for now 
+# TODO: migrate to dynaconf
+
+TOTAL_FLOOR_AREA_NATIONAL_AVERAGE = 70
+FLOOR_HEIGHT_NATIONAL_AVERAGE = 2.45
+
+FULLY_GLAZED_DESCRIPTIONS = [
+    "Fully double glazed",
+    "High performance glazing",
+    "Fully triple glazed",
+    "Full secondary glazing",
+    "Multiple glazing throughout",
+]
+
+FIXED_FEATURES = [
+    'PROPERTY_TYPE',
+    'BUILT_FORM',
+    'CONSTRUCTION_AGE_BAND',
+    'NUMBER_HABITABLE_ROOMS',
+    'CONSTITUENCY',
+    'NUMBER_HEATED_ROOMS',
+    'FIXED_LIGHTING_OUTLETS_COUNT',
+    'FLOOR_HEIGHT',
+    'FLOOR_LEVEL',
+    'TOTAL_FLOOR_AREA',
+]
+
+COMPONENT_FEATURES = [
+    'TRANSACTION_TYPE',
+    'WALLS_DESCRIPTION',
+    'FLOOR_DESCRIPTION',
+    'LIGHTING_DESCRIPTION',
+    'ROOF_DESCRIPTION',
+    'MAINHEAT_DESCRIPTION',
+    'HOTWATER_DESCRIPTION',
+    'MAIN_FUEL',
+    'MECHANICAL_VENTILATION',
+    'SECONDHEAT_DESCRIPTION',
+    'ENERGY_TARIFF',  # Not sure if this is relevant
+    'SOLAR_WATER_HEATING_FLAG',
+    'PHOTO_SUPPLY',
+    'WINDOWS_DESCRIPTION',
+    'GLAZED_TYPE',
+    'MULTI_GLAZE_PROPORTION',
+    'LIGHTING_DESCRIPTION',
+    'LOW_ENERGY_LIGHTING',
+    'NUMBER_OPEN_FIREPLACES',
+    'MAINHEATCONT_DESCRIPTION',
+    'EXTENSION_COUNT',
+    # 'GLAZED_AREA',  # May not need this since we have MULTI_GLAZE_PROPORTION
+]
+
+# For these fields, we take an average if we have multiple values
+AVERAGE_FIXED_FEATURES = [
+    "TOTAL_FLOOR_AREA",
+    "FLOOR_HEIGHT"
+]
+
+# For these fields, we take the latest value if we have multiple values
+# Since more recent EPCs have been conducted with more rigour, we assume that the latest value is
+# the most accurate
+LATEST_FIELD = [
+    "NUMBER_HABITABLE_ROOMS",
+    "NUMBER_HEATED_ROOMS",
+    "FIXED_LIGHTING_OUTLETS_COUNT",
+    "FLOOR_LEVEL",
+    "CONSTRUCTION_AGE_BAND",  # This is a field we're probably want to use verisk data for
+]
+
+# If we see thee features changing, we don't use the EPC, since deem it not to be reliable
+MANDATORY_FIXED_FEATURES = [
+    "PROPERTY_TYPE",
+    "BUILT_FORM",
+    "CONSTITUENCY"
+]
+
+# For particularly old EPC data, we have inconsistent records so we'll only include EPCS that were
+# conducted after 2010, since SAP09 was introduced in 2009 an later SAP12 was introduced in England
+# and Wales from 31 July 2014
+EARLIEST_EPC_DATE = "2014-08-01"
+
+RDSAP_RESPONSE = "CURRENT_ENERGY_EFFICIENCY"
+HEAT_DEMAND_RESPONSE = "ENERGY_CONSUMPTION_CURRENT"
+
+def ordinal(n):
+    if 10 <= n % 100 <= 20:
+        suffix = 'th'
+    else:
+        suffix = {1: 'st', 2: 'nd', 3: 'rd'}.get(n % 10, 'th')
+
+    return str(n) + suffix
+
+FLOOR_LEVEL_MAP = {
+    "Basement": -1,
+    "Ground": 0,
+    "ground floor": 0,
+    "20+": 20,
+    "21st or above": 21,
+    **{str(i).zfill(2): i for i in range(0, 21)},
+    **{ordinal(i): i for i in range(-1, 21)},
+    **{str(i): i for i in range(-1, 21)},
+    **{i: i for i in range(-1, 21)},
+}
+
+BUILT_FORM_REMAP = {
+    "Enclosed End-Terrace": "End-Terrace",
+    "Enclosed Mid-Terrace": "Mid-Terrace",
+}
+
+DATA_PROCESSOR_SETTINGS = {
+    'low_memory': False,
+    'epc_minimum_count': 1,
+    'column_mappings': {'UPRN': [int, str]}
+}
--- a/model_data/simulation_system/app.py
+++ b/model_data/simulation_system/app.py
@ -1,171 +1,21 @@
 import numpy as np
-import os
 import pandas as pd
 from tqdm import tqdm
 from model_data.BaseUtility import BaseUtility
+from pathlib import Path
+from model_data.simulation_system.Settings import (
+    MANDATORY_FIXED_FEATURES,
+    AVERAGE_FIXED_FEATURES, 
+    LATEST_FIELD, 
+    COMPONENT_FEATURES, 
+    RDSAP_RESPONSE,
+    HEAT_DEMAND_RESPONSE,
+    FLOOR_LEVEL_MAP,
+    BUILT_FORM_REMAP
+)
+from DataProcessor import DataProcessor

-
-def list_subdirectories(directory_path):
-    return [d for d in os.listdir(directory_path) if os.path.isdir(os.path.join(directory_path, d))]
-
-
-DATA_DIRECTORY = os.getcwd() + '/model_data/simulation_system/data/all-domestic-certificates'
-
-FIXED_FEATURES = [
-    'PROPERTY_TYPE',
-    'BUILT_FORM',
-    'CONSTRUCTION_AGE_BAND',
-    'NUMBER_HABITABLE_ROOMS',
-    'CONSTITUENCY',
-    'NUMBER_HEATED_ROOMS',
-    'FIXED_LIGHTING_OUTLETS_COUNT',
-    'FLOOR_HEIGHT',
-    'FLOOR_LEVEL',
-    'TOTAL_FLOOR_AREA',
-]
-
-COMPONENT_FEATURES = [
-    'TRANSACTION_TYPE',
-    'WALLS_DESCRIPTION',
-    'FLOOR_DESCRIPTION',
-    'LIGHTING_DESCRIPTION',
-    'ROOF_DESCRIPTION',
-    'MAINHEAT_DESCRIPTION',
-    'HOTWATER_DESCRIPTION',
-    'MAIN_FUEL',
-    'MECHANICAL_VENTILATION',
-    'SECONDHEAT_DESCRIPTION',
-    'ENERGY_TARIFF',  # Not sure if this is relevant
-    'SOLAR_WATER_HEATING_FLAG',
-    'PHOTO_SUPPLY',
-    'WINDOWS_DESCRIPTION',
-    'GLAZED_TYPE',
-    'MULTI_GLAZE_PROPORTION',
-    'LIGHTING_DESCRIPTION',
-    'LOW_ENERGY_LIGHTING',
-    'NUMBER_OPEN_FIREPLACES',
-    'MAINHEATCONT_DESCRIPTION',
-    'EXTENSION_COUNT',
-    # 'GLAZED_AREA',  # May not need this since we have MULTI_GLAZE_PROPORTION
-]
-
-# For these fields, we take an average if we have multiple values
-AVERAGE_FIXED_FEATURES = [
-    "TOTAL_FLOOR_AREA",
-    "FLOOR_HEIGHT"
-]
-
-# For these fields, we take the latest value if we have multiple values
-# Since more recent EPCs have been conducted with more rigour, we assume that the latest value is
-# the most accurate
-LATEST_FIELD = [
-    "NUMBER_HABITABLE_ROOMS",
-    "NUMBER_HEATED_ROOMS",
-    "FIXED_LIGHTING_OUTLETS_COUNT",
-    "CONSTRUCTION_AGE_BAND",
-    "FLOOR_LEVEL",
-    "CONSTRUCTION_AGE_BAND",  # This is a field we're probably want to use verisk data for
-]
-
-# If we see thee features changing, we don't use the EPC, since deem it not to be reliable
-MANDATORY_FIXED_FEATURES = [
-    "PROPERTY_TYPE",
-    "BUILT_FORM",
-    "CONSTITUENCY"
-]
-
-# For particularly old EPC data, we have inconsistent records so we'll only include EPCS that were
-# conducted after 2010, since SAP09 was introduced in 2009 an later SAP12 was introduced in England
-# and Wales from 31 July 2014
-EARLIEST_EPC_DATE = "2014-08-01"
-
-RDSAP_RESPONSE = "CURRENT_ENERGY_EFFICIENCY"
-HEAT_DEMAND_RESPONSE = "ENERGY_CONSUMPTION_CURRENT"
-
-
-def make_cleaning_averages(df):
-    # Define a custom function to calculate the median, excluding missing values
-    def median_without_missing(group):
-        return group[AVERAGE_FIXED_FEATURES].median(skipna=True)
-
-    cleaning_averages = df.groupby(
-        ["PROPERTY_TYPE", "BUILT_FORM", "CONSTRUCTION_AGE_BAND", "NUMBER_HABITABLE_ROOMS", "NUMBER_HEATED_ROOMS"],
-        observed=True
-    ).apply(median_without_missing).reset_index()
-
-    general_averages = df.groupby(["PROPERTY_TYPE", "BUILT_FORM"], observed=True).apply(
-        median_without_missing).reset_index()
-
-    return cleaning_averages, general_averages
-
-
-def iterative_filtering(cleaning_averages, property_data):
-    # Define the columns to filter on
-    columns_to_filter = ["PROPERTY_TYPE", "BUILT_FORM", "CONSTRUCTION_AGE_BAND", "NUMBER_HABITABLE_ROOMS",
-                         "NUMBER_HEATED_ROOMS"]
-
-    # Start with the entire cleaning_averages DataFrame
-    filtered_data = cleaning_averages.copy()
-
-    # Iterate through the columns and apply filters one by one
-    for column in columns_to_filter:
-        # Apply the filter using the value from property_data
-        new_filtered_data = filtered_data[filtered_data[column] == property_data[column].iloc[0]]
-
-        # If the filter results in no data, return the previous result
-        if new_filtered_data.empty:
-            continue
-
-        # If the filter is successful, update the filtered data
-        filtered_data = new_filtered_data
-
-    return filtered_data
-
-
-def clean_multi_glaze_proportion(df):
-    fully_glazed_descriptions = [
-        "Fully double glazed",
-        "High performance glazing",
-        "Fully triple glazed",
-        "Full secondary glazing",
-        "Multiple glazing throughout",
-    ]
-
-    df["MULTI_GLAZE_PROPORTION"] = np.where(
-        pd.isnull(df["MULTI_GLAZE_PROPORTION"]) & (df["WINDOWS_DESCRIPTION"].isin(fully_glazed_descriptions)),
-        100,
-        df["MULTI_GLAZE_PROPORTION"],
-    )
-
-    return df
-
-
-def ordinal(n):
-    if 10 <= n % 100 <= 20:
-        suffix = 'th'
-    else:
-        suffix = {1: 'st', 2: 'nd', 3: 'rd'}.get(n % 10, 'th')
-
-    return str(n) + suffix
-
-
-FLOOR_LEVEL_MAP = {
-    "Basement": -1,
-    "Ground": 0,
-    "ground floor": 0,
-    "20+": 20,
-    "21st or above": 21,
-    **{str(i).zfill(2): i for i in range(0, 21)},
-    **{ordinal(i): i for i in range(-1, 21)},
-    **{str(i): i for i in range(-1, 21)},
-    **{i: i for i in range(-1, 21)},
-}
-
-BUILT_FORM_REMAP = {
-    "Enclosed End-Terrace": "End-Terrace",
-    "Enclosed Mid-Terrace": "Mid-Terrace",
-}
-
+DATA_DIRECTORY = Path(__file__).parent / 'data' / 'all-domestic-certificates'

 def app():
    # Get all the files in the directory
@ -173,108 +23,98 @@ def app():
    # Data glossary:
    # https://epc.opendatacommunities.org/docs/guidance#glossary

-    directories = list_subdirectories(DATA_DIRECTORY)
+    # List all subdirectories
+    directories = [entry for entry in DATA_DIRECTORY.iterdir() if entry.is_dir()]

    dataset = []
-    for directory in tqdm(directories):
-        filepath = os.path.join(DATA_DIRECTORY, directory, "certificates.csv")
-        df = pd.read_csv(filepath, low_memory=False)
-        # UPRN is a unique identifier for a property, so we remove any EPCs that don't have one
-        df = df[~pd.isnull(df["UPRN"])]
-        # Lodgement date is the date the EPC was lodged, so we remove any EPCs that were lodged
-        # before the introduction of SAP09
-        df = df[df["LODGEMENT_DATE"] >= EARLIEST_EPC_DATE]

-        cleaning_averages, general_averages = make_cleaning_averages(df)
+    for directory in tqdm(directories): 

-        # We remove EPCS that were conducted for a new build, since these are performed with
-        # full SAP, which produces different results to the RdSAP methodology
-        df = df[df["TRANSACTION_TYPE"] != "new dwelling"]
+        filepath = directory / "certificates.csv"

-        df = clean_multi_glaze_proportion(df)
+        data_processor = DataProcessor(filepath=filepath)

-        # We remove floor level in top floor or mid floor since this is ambiguous
-        df = df[~df["FLOOR_LEVEL"].isin(["top floor", "mid floor"])]
-
-        df["UPRN"] = df["UPRN"].astype(int).astype(str)
-        counts = df.groupby("UPRN").size().reset_index()
-        counts.columns = ["UPRN", "count"]
-        counts = counts.sort_values("count", ascending=False)
-
-        # take UPRNS with multiple EPCs
-        counts = counts[counts["count"] > 1]
-        df = df[df["UPRN"].isin(counts["UPRN"])]
-        df = df.sort_values(["UPRN", "LODGEMENT_DATE"], ascending=True)
+        df = data_processor.pre_process()
+        cleaning_averages = data_processor.make_cleaning_averages()

        for uprn, property_data in df.groupby("UPRN", observed=True):

            # Fixed features - these are property attributes that shouldn't change over time
-
-            ignore_epc = False
            fixed_data = {}
-            for field in FIXED_FEATURES:
-                vals = property_data[field].dropna().unique()
-                # Remove invalid values
-                vals = [v for v in vals if v not in BaseUtility.DATA_ANOMALY_MATCHES]

-                if field == "FLOOR_LEVEL":
-                    vals = list({FLOOR_LEVEL_MAP[v] for v in vals})
+            # If a property has changed building type, we can ignore the epc rating i.e. this should be 1 unique row
+            if max(modified_property_data[MANDATORY_FIXED_FEATURES].nunique()) > 1:
+                continue

-                if field == "BUILT_FORM":
-                    vals = list({BUILT_FORM_REMAP.get(v, v) for v in vals})
+            # Map all anomaly values to None
+            data_anomaly_map = dict(zip(BaseUtility.DATA_ANOMALY_MATCHES, [None]*len(BaseUtility.DATA_ANOMALY_MATCHES)))
+            
+            # Use replace function to map data (if exists in key), to corresponding value - i.e. Remove invalid values
+            modified_property_data = property_data.replace(data_anomaly_map)
+            modified_property_data = modified_property_data.replace(np.NAN, None)
+            
+            # Remap certain columns
+            modified_property_data['FLOOR_LEVEL'] = modified_property_data['FLOOR_LEVEL'].replace(FLOOR_LEVEL_MAP)
+            modified_property_data['BUILT_FROM'] = modified_property_data['BUILT_FORM'].replace(BUILT_FORM_REMAP)

-                if field in AVERAGE_FIXED_FEATURES:
+            # Take the latest row for both the LATEST_FEILDS and MANDATORY FIELDS 
+            latest_field_data = modified_property_data[LATEST_FIELD].iloc[-1].to_dict()
+            mandatory_field_data = modified_property_data[MANDATORY_FIXED_FEATURES].iloc[-1].to_dict()

-                    if len(vals) > 1:
-                        # Check the values are too far apart
-                        if abs(vals[0] - vals[1]) / vals[0] > 0.1:
-                            # Take the more recent value since it's likely to be more accurate
-                            vals = [vals[-1]]
+            # Taking just the last row, which is the percentage change from the latest to previous one only
+            # modified_property_data[AVERAGE_FIXED_FEATURES].fillna(value=0).pct_change().iloc[-1] > 0.1

-                    if vals:
-                        field_value = np.mean(vals)
-                    else:
-                        # Clean using averages
+            # We can replace any NA values for Average fixed features
+            # We have columns that we want to merge on, but some of these columns are all NA values
+            # So we determine which columns to merge on, and get the equivalent grouping in the averages
+            columns_to_merge_on = ["PROPERTY_TYPE", "BUILT_FORM", "CONSTRUCTION_AGE_BAND", "NUMBER_HABITABLE_ROOMS",
+                         "NUMBER_HEATED_ROOMS"]
+            
+            if any(modified_property_data[columns_to_merge_on].isna()):
+                # If there are any NA value, back fill first (i.e most recent), then forward fill if needed
+                modified_property_data[columns_to_merge_on] = modified_property_data[columns_to_merge_on].fillna(method='bfill').fillna(method='ffill')
+            
+            # Extract the columns that are not all None
+            na_columns = modified_property_data[columns_to_merge_on].isna().all()
+            columns_to_merge_on = na_columns.index[~na_columns].to_list()

-                        avgs = iterative_filtering(cleaning_averages, property_data)
-                        # TODO: Should probably do a mean/median?
-                        field_value = avgs[field].iloc[0]
+            #  Get the corresponding groupby and merge, and fill in NA values
+            cleaning_averages_to_merge = cleaning_averages.groupby(columns_to_merge_on)[['TOTAL_FLOOR_AREA', 'FLOOR_HEIGHT']].mean()
+            modified_property_data = pd.merge(modified_property_data, cleaning_averages_to_merge, on=columns_to_merge_on, suffixes=['', '_AVERAGE'])
+            modified_property_data['TOTAL_FLOOR_AREA'] = modified_property_data['TOTAL_FLOOR_AREA'].fillna(modified_property_data['TOTAL_FLOOR_AREA_AVERAGE'])
+            modified_property_data['FLOOR_HEIGHT'] = modified_property_data['FLOOR_HEIGHT'].fillna(modified_property_data['FLOOR_HEIGHT_AVERAGE'])
+            modified_property_data = modified_property_data.drop(columns=['TOTAL_FLOOR_AREA_AVERAGE', 'FLOOR_HEIGHT_AVERAGE'])

-                        if pd.isnull(field_value):
-                            # Just the use the general averages
-                            field_value = general_averages[
-                                (general_averages["PROPERTY_TYPE"] == property_data["PROPERTY_TYPE"].iloc[0]) &
-                                (general_averages["BUILT_FORM"] == property_data["BUILT_FORM"].iloc[0])
-                                ][field].iloc[0]
-
-                elif field in LATEST_FIELD:
-                    field_value = vals[-1] if vals else None
-                else:
-                    if len(vals) > 1:
-                        if field in MANDATORY_FIXED_FEATURES:
-                            ignore_epc = True
-                        else:
-                            raise ValueError("Fixed feature {} has more than one value - fix me".format(field))
-
-                    field_value = vals[0] if vals else None
+            for field in AVERAGE_FIXED_FEATURES:
+                vals =  list(modified_property_data[field].dropna().unique())
+                if len(vals) > 1:
+                    # Check the values are too far apart
+                    # TODO: we could have multiple values here, why only use the first two?
+                    if abs(vals[0] - vals[1]) / vals[0] > 0.1:
+                        # Take the more recent value since it's likely to be more accurate
+                        vals = [vals[-1]]

+                if vals:
+                    field_value = np.mean(vals)
+                
                fixed_data[field] = field_value

-            if ignore_epc:
-                continue
+            #Combine all fields together
+            fixed_data.update(mandatory_field_data)
+            fixed_data.update(latest_field_data)

            # We include the lodgement date here as we probably need to factor time into the
            # model, since EPC standards and rigour have changed over time
-            variable_data = property_data[
+            variable_data = modified_property_data[
                COMPONENT_FEATURES + ["LODGEMENT_DATE", RDSAP_RESPONSE, HEAT_DEMAND_RESPONSE]
                ]

            # Note: we look at changes between subsequent EPCS, however we could look at other permutations
            # e.g. first vs second, second vs third and also first vs third
            property_model_data = []
-            for idx in range(0, property_data.shape[0] - 1):
+            for idx in range(0, modified_property_data.shape[0] - 1):

-                if idx >= property_data.shape[0] - 1:
+                if idx >= modified_property_data.shape[0] - 1:
                    break

                starting_record = variable_data.iloc[idx]
@ -307,3 +147,10 @@ def app():
                )

            dataset.extend(property_model_data)
+
+    output = pd.DataFrame(dataset)
+    output.to_parquet('./dataset.parquet')
+
+
+if __name__ == "__main__":
+    app()
--- a/model_data/simulation_system/energy_predictor.py
+++ b/model_data/simulation_system/energy_predictor.py
@ -0,0 +1,117 @@
+from pathlib import Path
+from Settings import (
+    RDSAP_RESPONSE, 
+    FLOOR_LEVEL_MAP, 
+    BUILT_FORM_REMAP,
+    EARLIEST_EPC_DATE, 
+    FULLY_GLAZED_DESCRIPTIONS,
+    FIXED_FEATURES,
+    LATEST_FIELD,
+    COMPONENT_FEATURES
+    )
+from model_data.BaseUtility import BaseUtility
+from tqdm import tqdm
+import pandas as pd
+import numpy as np
+
+from autogluon.tabular import TabularDataset, TabularPredictor
+
+RANDOM_SEED = 0
+
+DATA_DIRECTORY = Path(__file__).parent / 'data' / 'all-domestic-certificates'
+
+FLOAT_COLUMNS = [
+            'NUMBER_OPEN_FIREPLACES', 
+            'EXTENSION_COUNT',
+            'TOTAL_FLOOR_AREA',
+            'PHOTO_SUPPLY',
+            'FIXED_LIGHTING_OUTLETS_COUNT',
+            'FLOOR_HEIGHT',
+            'NUMBER_HABITABLE_ROOMS',
+            'LOW_ENERGY_LIGHTING',
+            'MULTI_GLAZE_PROPORTION',
+            'NUMBER_HEATED_ROOMS'
+        ]
+
+def create_raw_data():
+    """
+    Extract all information to do a simple predictor for RDSAP
+    """
+
+    directories = [entry for entry in DATA_DIRECTORY.iterdir() if entry.is_dir()]
+    # directories = directories[0:10]
+    dfs = [] 
+    for directory in tqdm(directories):
+        filepath = directory / "certificates.csv"
+        df = pd.read_csv(filepath, low_memory=False)
+
+        # Remove any bad uprns and ignore old/bad data
+        df = df[~pd.isnull(df["UPRN"])]
+        df = df[df["LODGEMENT_DATE"] >= EARLIEST_EPC_DATE]
+        df = df[df["TRANSACTION_TYPE"] != "new dwelling"]
+        df = df[~df["FLOOR_LEVEL"].isin(["top floor", "mid floor"])]
+
+        # Change multi glaze proportion
+        no_multi_glaze_proportion_index = pd.isnull(df["MULTI_GLAZE_PROPORTION"]) & (df["WINDOWS_DESCRIPTION"].isin(FULLY_GLAZED_DESCRIPTIONS))
+        df.loc[no_multi_glaze_proportion_index, 'MULTI_GLAZE_PROPORTION'] = 100
+
+        # Recast 
+        df["UPRN"] = df["UPRN"].astype(int).astype(str)
+        df['MAIN_HEATING_CONTROLS'] = df['MAIN_HEATING_CONTROLS'].astype(float)
+
+        # Sort Data
+        df = df.sort_values(["UPRN", "LODGEMENT_DATE"], ascending=True)
+
+        # Map all anomaly values to None
+        data_anomaly_map = dict(zip(BaseUtility.DATA_ANOMALY_MATCHES, [None]*len(BaseUtility.DATA_ANOMALY_MATCHES)))
+        
+        # Use replace function to map data (if exists in key), to corresponding value - i.e. Remove invalid values
+        df = df.replace(data_anomaly_map)
+        df = df.replace(np.NAN, None)
+        
+        # Remap certain columns
+        df['FLOOR_LEVEL'] = df['FLOOR_LEVEL'].replace(FLOOR_LEVEL_MAP)
+        df['BUILT_FROM'] = df['BUILT_FORM'].replace(BUILT_FORM_REMAP)
+
+        # Keep only possible modelling columns
+        df = df[[RDSAP_RESPONSE] + list(set(FIXED_FEATURES + LATEST_FIELD + COMPONENT_FEATURES))]
+
+        # Reduce memory usage
+
+        # df.memory_usage()
+        # df.dtypes
+        df[RDSAP_RESPONSE] = pd.to_numeric(df[RDSAP_RESPONSE], downcast='unsigned')
+        df[FLOAT_COLUMNS] = df[FLOAT_COLUMNS].apply(pd.to_numeric, downcast='float')
+
+
+        dfs.append(df)
+
+    data = pd.concat(dfs)
+    data.to_parquet('./energy_predictor_data.parquet')
+
+    cleaned_data = data.dropna()
+    # GIves you primarily flats
+    cleaned_data.to_parquet('./energy_predictor_cleaned_data.parquet')
+
+
+def main():
+
+    data = TabularDataset(data='./model_build_data/energy_data/cleaned_data/train_validation_data.parquet')
+
+    subsample_size = round(len(data)/100)
+    data = data.sample(subsample_size, random_state=RANDOM_SEED)
+
+    predictor_RDSAP = TabularPredictor(
+        label=RDSAP_RESPONSE, 
+        path="agModels-predictENERGY", 
+        problem_type="regression",
+        eval_metric='mean_absolute_error'
+        ).fit(data, time_limit=800, presets='high_quality', excluded_model_types=['KNN', 'CAT'])
+
+    test_data = TabularDataset('./model_build_data/energy_data/cleaned_data/test_data.parquet')
+    performance = predictor_RDSAP.evaluate(test_data)
+    predictions = predictor_RDSAP.predict(test_data)
+    predictor_RDSAP.feature_importance(test_data)
+
+if __name__ == "__main__":
+    main()
--- a/model_data/simulation_system/preprocessed_data/dataset.parquet
+++ b/model_data/simulation_system/preprocessed_data/dataset.parquet
--- a/model_data/simulation_system/test_data_generation.py
+++ b/model_data/simulation_system/test_data_generation.py
@ -0,0 +1,77 @@
+from Logger import logger
+import argparse
+import pandas as pd
+from pathlib import Path
+
+RANDOM_SEED = 0
+
+def ingest_arguments() -> argparse.Namespace:
+    """
+    Helper function to take in arguments from script start
+    """
+
+    parser = argparse.ArgumentParser(description='Inputs for training script')
+
+    parser.add_argument('--filepath', type=str, help='Location of Parquet dataset to load', required=True)
+    parser.add_argument('--output-folder', type=str, help='Location of Parquet dataset to save', required=True)
+    parser.add_argument('--percentage', type=float, help='Percentage of data to use as test data', default=None)
+    parser.add_argument('--volume', type=int, help='Volume of data to use as test data', default=None)
+    parser.add_argument('--sampling', type=str, help='Type of sampling to do for test data', choices=['random', 'stratified'], default='random')
+
+    args = parser.parse_args()
+
+    return args
+
+def main(filepath: str, output_folder: str, percentage: float, volume: int, sampling: str):
+    """
+    Load a dataset in and split out the training+validation data and the test data.
+    """
+
+    logger.info('---Loading Data---')
+    data = pd.read_parquet(filepath).reset_index(drop=True)
+
+    if percentage and volume is None:
+        test_amount = round(len(data)*percentage)
+    elif percentage is None and volume:
+        test_amount = volume
+    elif percentage is None and volume is None:
+        logger.error('No amount specified - please specify either a percentage or volume')
+        exit(1)
+    else:
+        logger.info('Both percentage and volume specified - taking largest of the two')
+        test_amount = max(round(len(data)*percentage), volume)
+
+    logger.info(f'---Extracting {test_amount} from dataset to be test data')
+
+    if sampling == 'random':
+        logger.info('--- Using random sample method ---')
+        sample_index = data.sample(n=test_amount, random_state=RANDOM_SEED).index
+
+        train_validation_data = data.drop(sample_index)
+        test_data = data.iloc[sample_index]
+
+    elif sampling =='stratified':
+        # Not yet implemented 
+        pass
+
+    logger.info('--- Saving data ---')
+
+    train_validation_data.to_parquet(Path(output_folder)/'train_validation_data.parquet')
+    test_data.to_parquet(Path(output_folder)/'test_data.parquet')
+
+    logger.info(' ---Pipeline complete---')
+
+if __name__ == "__main__":
+
+    logger.info('--- Generate test data pipeline ---')
+
+    args = ingest_arguments()
+
+    main(
+        filepath=args.filepath, 
+        output_folder=args.output_folder,
+        percentage=args.percentage, 
+        volume=args.volume, 
+        sampling=args.sampling
+        )
+
--- a/model_data/simulation_system/training.py
+++ b/model_data/simulation_system/training.py
@ -0,0 +1,131 @@
+import os
+import pandas as pd
+import argparse
+from typing import List
+from Logger import logger
+from autogluon.tabular import TabularDataset, TabularPredictor
+
+
+DROP_COLUMNS = ['UPRN', 'HEAT_DEMAND_CHANGE']
+FEATURE_COLUMNS = None
+RANDOM_SEED = 0
+
+# FOR TESTING
+train_filepath = "./model_build_data/train_validation_data.parquet"
+test_filepath = "./model_build_data/test_data.parquet"
+
+
+def ingest_arguments() -> argparse.Namespace:
+    """
+    Helper function to take in arguments from script start
+    """
+
+    parser = argparse.ArgumentParser(description='Inputs for training script')
+
+    parser.add_argument('--train-filepath', type=str, help='Location of Parquet dataset to load for training')
+    parser.add_argument('--test-filepath', type=str, help='Location of Parquet dataset to load for testing')
+
+    args = parser.parse_args()
+
+    return args
+
+
+class DataLoader():
+
+    @staticmethod
+    def load(filepath: str) -> pd.DataFrame:
+        """
+        Load different datasets
+        """
+        if filepath.endswith('.parquet'):
+            df = pd.read_parquet(filepath)
+        elif filepath.endswith('.csv.'):
+            df = pd.read_csv(filepath)
+        else:
+            logger.error('Not implemented!')
+            exit(1)
+
+        return df
+    
+class FeatureProcessor:
+    """
+    Handle all feature manipulation before modelling
+    """
+    
+    @staticmethod
+    def drop_columns(df: pd.DataFrame, drop_columns: str = DROP_COLUMNS) -> pd.DataFrame:
+        df = df.drop(columns=[drop_columns])
+        return df
+
+    def retain_features(df: pd.DataFrame, features: List[str] = None):
+        """
+        Determine which columns to keep ofr modelling
+        """
+        if features is None:
+            features = df.columns
+        else:
+            if not set(features).issubset(df.columns):
+                logger.error('Features defined is not contained in data')
+                exit(1)
+        
+        df = df[features]
+
+        return df
+    
+    def process(self, df: pd.DataFrame) -> pd.DataFrame:
+        df = self.drop_columns(df, drop_columns=DROP_COLUMNS)
+        df = self.retain_features(df, features=FEATURE_COLUMNS)
+        return df
+
+            
+
+def training(train_filepath: str, test_filepath: str) -> None:
+    """
+    Pipeline to run training on the dataset
+    """
+
+    logger.info('Loading data')
+    dataloader = DataLoader()
+    train_df = dataloader.load(filepath=train_filepath)
+    test_df = dataloader.load(filepath=test_filepath)
+
+    # df = pd.read_parquet(train_filepath).drop(columns=['HEAT_DEMAND_CHANGE'])
+ 
+    logger.info('Feature processing')
+    feature_processor = FeatureProcessor()
+    train_df = feature_processor.process(train_df)
+    test_df = feature_processor.process(test_df)
+
+    # logger.info('Split data into train and validation')
+
+    logger.info('Build Model')
+    data = TabularDataset(data=train_df)
+    # data['RDSAP_CHANGE'] = data['RDSAP_CHANGE'].astype(float)
+    subsample_size = round(len(data)/4)
+    data = data.sample(subsample_size, random_state=RANDOM_SEED)
+
+    target_column = 'RDSAP_CHANGE'
+    predictor_RDSAP = TabularPredictor(
+        label=target_column, 
+        path="agModels-predictRDSAP", 
+        problem_type="regression",
+        eval_metric='mean_absolute_error'
+        ).fit(data, time_limit=8000, presets='high_quality', excluded_model_types=['KNN'])
+
+    logger.info('Evaluate matrics')
+
+    test_data = TabularDataset('./model_build_data/test_data.parquet')
+    performance = predictor_RDSAP.evaluate(test_data)
+    predictions = predictor_RDSAP.predict(test_data)
+
+    test_data['predictions'] = predictions
+    test_data['diff'] = abs(test_data['RDSAP_CHANGE'] - test_data['predictions'])
+
+if __name__ == "__main__":
+
+    logger.info('---Begin Pipeline---')
+
+    logger.info('---Ingest Arguments---')
+    args = ingest_arguments()
+
+    training(train_filepath=args.train_filepath, test_filepath=args.test_filepath)