Add separate class for data processing - need to move all code into it eventually

2026-07-27 23:35:01 +00:00 · 2023-08-12 11:35:01 +00:00 · 2023-08-12 11:35:01 +00:00 · 92736084e8
commit 92736084e8
parent 9fe640dd50
4 changed files with 319 additions and 260 deletions
--- a/model_data/simulation_system/DataProcessor.py
+++ b/model_data/simulation_system/DataProcessor.py
@ -0,0 +1,142 @@
+from pathlib import Path
+import pandas as pd
+from settings import (
+    DATA_PROCESSOR_SETTINGS,
+    EARLIEST_EPC_DATE,
+    FULLY_GLAZED_DESCRIPTIONS,
+    AVERAGE_FIXED_FEATURES,
+    FLOOR_HEIGHT_NATIONAL_AVERAGE,
+    TOTAL_FLOOR_AREA_NATIONAL_AVERAGE
+    )
+
+
+class DataProcessor:
+    """
+    Load an EPC certificates CSV and apply the initial filtering and cleaning steps
+    """
+
+    def __init__(self, filepath: Path) -> None:
+        self.filepath = filepath  # CSV path; the file is only read when load_data() runs
+
+    def load_data(self, low_memory=False) -> None:
+        self.data = pd.read_csv(self.filepath, low_memory=low_memory)  # the cleaning methods operate on self.data
+
+    def pre_process(self) -> pd.DataFrame:
+        """
+        Load the data, run the initial cleaning steps in order and return it sorted by UPRN and LODGEMENT_DATE
+        """
+        self.load_data(low_memory=DATA_PROCESSOR_SETTINGS['low_memory'])
+        self.confine_data()
+        self.recast_df_columns(column_mappings=DATA_PROCESSOR_SETTINGS['column_mappings'])
+        self.clean_multi_glaze_proportion()
+        self.retain_multiple_epc_properties(epc_minimum_count=DATA_PROCESSOR_SETTINGS['epc_minimum_count'])
+
+        self.data = self.data.sort_values(["UPRN", "LODGEMENT_DATE"], ascending=True)
+
+        return self.data
+
+    def make_cleaning_averages(self) -> pd.DataFrame:
+        """Return per-group medians of AVERAGE_FIXED_FEATURES, with NA floor area / height filled from coarser groupings"""
+        def median_without_missing(group):
+            return group[AVERAGE_FIXED_FEATURES].median(skipna=True)  # median that ignores missing values
+
+        cleaning_averages = self.data.groupby(
+            ["PROPERTY_TYPE", "BUILT_FORM", "CONSTRUCTION_AGE_BAND", "NUMBER_HABITABLE_ROOMS", "NUMBER_HEATED_ROOMS"],
+            observed=True
+        ).apply(median_without_missing).reset_index()
+
+        general_averages = self.data.groupby(["PROPERTY_TYPE", "BUILT_FORM"], observed=True).apply(
+            median_without_missing).reset_index()
+
+        property_averages = self.data.groupby(["PROPERTY_TYPE"], observed=True).apply(
+            median_without_missing).reset_index()
+
+        built_form_averages = self.data.groupby(["BUILT_FORM"], observed=True).apply(
+            median_without_missing).reset_index()
+
+        # Attach the coarser averages as suffixed columns so they can fill NA values in the cleaning averages
+        cleaning_averages_filled = pd.merge(cleaning_averages, general_averages, on=['PROPERTY_TYPE', 'BUILT_FORM'], suffixes=['', '_AVERAGE'])
+        cleaning_averages_filled = pd.merge(cleaning_averages_filled, property_averages, on=['PROPERTY_TYPE'], suffixes=['', '_PROPERTY_AVERAGE'])
+        cleaning_averages_filled = pd.merge(cleaning_averages_filled, built_form_averages, on=['BUILT_FORM'], suffixes=['', '_BUILT_FORM_AVERAGE'])
+
+        # Replace any NA values with the averages for the same property type and built form
+        cleaning_averages_filled['TOTAL_FLOOR_AREA'] = cleaning_averages_filled['TOTAL_FLOOR_AREA'].fillna(cleaning_averages_filled['TOTAL_FLOOR_AREA_AVERAGE'])
+        cleaning_averages_filled['FLOOR_HEIGHT'] = cleaning_averages_filled['FLOOR_HEIGHT'].fillna(cleaning_averages_filled['FLOOR_HEIGHT_AVERAGE'])
+        cleaning_averages_filled = cleaning_averages_filled.drop(columns=['TOTAL_FLOOR_AREA_AVERAGE', 'FLOOR_HEIGHT_AVERAGE'])
+
+        #  If there are still NA values, i.e. the averages have no value for a specific combination of property
+        #  type and built form, we fall back to the property type average alone
+        cleaning_averages_filled['TOTAL_FLOOR_AREA'] = cleaning_averages_filled['TOTAL_FLOOR_AREA'].fillna(cleaning_averages_filled['TOTAL_FLOOR_AREA_PROPERTY_AVERAGE'])
+        cleaning_averages_filled['FLOOR_HEIGHT'] = cleaning_averages_filled['FLOOR_HEIGHT'].fillna(cleaning_averages_filled['FLOOR_HEIGHT_PROPERTY_AVERAGE'])
+        cleaning_averages_filled = cleaning_averages_filled.drop(columns=['TOTAL_FLOOR_AREA_PROPERTY_AVERAGE', 'FLOOR_HEIGHT_PROPERTY_AVERAGE'])
+
+        # If there are still NA values, use the built form averages
+        cleaning_averages_filled['TOTAL_FLOOR_AREA'] = cleaning_averages_filled['TOTAL_FLOOR_AREA'].fillna(cleaning_averages_filled['TOTAL_FLOOR_AREA_BUILT_FORM_AVERAGE'])
+        cleaning_averages_filled['FLOOR_HEIGHT'] = cleaning_averages_filled['FLOOR_HEIGHT'].fillna(cleaning_averages_filled['FLOOR_HEIGHT_BUILT_FORM_AVERAGE'])
+        cleaning_averages_filled = cleaning_averages_filled.drop(columns=['TOTAL_FLOOR_AREA_BUILT_FORM_AVERAGE', 'FLOOR_HEIGHT_BUILT_FORM_AVERAGE'])
+
+        # If there are still NA values, use the mean of the group values already present in this table
+        cleaning_averages_filled['TOTAL_FLOOR_AREA'] = cleaning_averages_filled['TOTAL_FLOOR_AREA'].fillna(cleaning_averages_filled['TOTAL_FLOOR_AREA'].mean())
+        cleaning_averages_filled['FLOOR_HEIGHT'] = cleaning_averages_filled['FLOOR_HEIGHT'].fillna(cleaning_averages_filled['FLOOR_HEIGHT'].mean())
+
+        # If every value is NA (so the mean is NaN as well), fall back to the national average constants
+        cleaning_averages_filled['TOTAL_FLOOR_AREA'] = cleaning_averages_filled['TOTAL_FLOOR_AREA'].fillna(TOTAL_FLOOR_AREA_NATIONAL_AVERAGE)
+        cleaning_averages_filled['FLOOR_HEIGHT'] = cleaning_averages_filled['FLOOR_HEIGHT'].fillna(FLOOR_HEIGHT_NATIONAL_AVERAGE)
+
+        return cleaning_averages_filled
+
+    def retain_multiple_epc_properties(self, epc_minimum_count: int = 1) -> None:
+        '''
+        Reduce the data further by keeping only UPRNs that have more than epc_minimum_count EPCs
+        '''
+
+        counts = self.data.groupby("UPRN").size().reset_index()
+        counts.columns = ["UPRN", "count"]
+
+        # take UPRNs with multiple EPCs; the inner merge drops the rest and adds the "count" column
+        counts = counts[counts["count"] > epc_minimum_count]
+        self.data = pd.merge(self.data, counts, on='UPRN')
+
+
+    def recast_df_columns(self, column_mappings: dict) -> None:
+        """
+        Cast each mapped column through its listed dtypes in order; exit with status 1 if a column is missing
+        """
+
+        for key, values in column_mappings.items():
+            if key not in self.data.columns:
+                # exit() is injected by the site module for interactive use, so raise SystemExit directly
+                raise SystemExit(f'Column mapping incorrectly specified: {key!r} is not a column')
+            for value in values:
+                self.data[key] = self.data[key].astype(value)
+
+
+    def confine_data(self) -> None:
+        """
+        Include all steps that reduce the data based on our assumptions
+        """
+
+        # Filter 1: UPRN is a unique identifier for a property, so we remove any EPCs that don't have one
+
+        # Filter 2: Lodgement date is the date the EPC was lodged, so we remove any EPCs that were lodged
+        # before EARLIEST_EPC_DATE
+
+        # Filter 3: We remove EPCs that were conducted for a new build, since these are performed with
+        # full SAP, which produces different results to the RdSAP methodology
+
+        # Filter 4: We remove EPCs whose floor level is "top floor" or "mid floor" since this is ambiguous
+
+        self.data = self.data[~pd.isnull(self.data["UPRN"])]
+        self.data = self.data[self.data["LODGEMENT_DATE"] >= EARLIEST_EPC_DATE]
+        self.data = self.data[self.data["TRANSACTION_TYPE"] != "new dwelling"]
+        self.data = self.data[~self.data["FLOOR_LEVEL"].isin(["top floor", "mid floor"])]
+
+
+    def clean_multi_glaze_proportion(self) -> None:
+        """
+        If there is no multi-glaze proportion but the windows are fully glazed, then we assume a proportion of 100
+        """
+
+        no_multi_glaze_proportion_index = pd.isnull(self.data["MULTI_GLAZE_PROPORTION"]) & (self.data["WINDOWS_DESCRIPTION"].isin(FULLY_GLAZED_DESCRIPTIONS))
+        self.data.loc[no_multi_glaze_proportion_index, 'MULTI_GLAZE_PROPORTION'] = 100
+
--- a/model_data/simulation_system/app.py
+++ b/model_data/simulation_system/app.py
@ -1,269 +1,42 @@
 import numpy as np
-import os
 import pandas as pd
 from tqdm import tqdm
 from model_data.BaseUtility import BaseUtility
 from pathlib import Path
-from typing import Tuple
-
-def list_subdirectories(directory_path):
-    return [entry for entry in directory_path.iterdir() if entry.is_dir()]
+from settings import (
+    MANDATORY_FIXED_FEATURES,
+    AVERAGE_FIXED_FEATURES, 
+    LATEST_FIELD, 
+    COMPONENT_FEATURES, 
+    RDSAP_RESPONSE,
+    HEAT_DEMAND_RESPONSE,
+    FLOOR_LEVEL_MAP,
+    BUILT_FORM_REMAP
+)
+from DataProcessor import DataProcessor

 DATA_DIRECTORY = Path(__file__).parent / 'data' / 'all-domestic-certificates'

-FULLY_GLAZED_DESCRIPTIONS = [
-    "Fully double glazed",
-    "High performance glazing",
-    "Fully triple glazed",
-    "Full secondary glazing",
-    "Multiple glazing throughout",
-]
-
-FIXED_FEATURES = [
-    'PROPERTY_TYPE',
-    'BUILT_FORM',
-    'CONSTRUCTION_AGE_BAND',
-    'NUMBER_HABITABLE_ROOMS',
-    'CONSTITUENCY',
-    'NUMBER_HEATED_ROOMS',
-    'FIXED_LIGHTING_OUTLETS_COUNT',
-    'FLOOR_HEIGHT',
-    'FLOOR_LEVEL',
-    'TOTAL_FLOOR_AREA',
-]
-
-COMPONENT_FEATURES = [
-    'TRANSACTION_TYPE',
-    'WALLS_DESCRIPTION',
-    'FLOOR_DESCRIPTION',
-    'LIGHTING_DESCRIPTION',
-    'ROOF_DESCRIPTION',
-    'MAINHEAT_DESCRIPTION',
-    'HOTWATER_DESCRIPTION',
-    'MAIN_FUEL',
-    'MECHANICAL_VENTILATION',
-    'SECONDHEAT_DESCRIPTION',
-    'ENERGY_TARIFF',  # Not sure if this is relevant
-    'SOLAR_WATER_HEATING_FLAG',
-    'PHOTO_SUPPLY',
-    'WINDOWS_DESCRIPTION',
-    'GLAZED_TYPE',
-    'MULTI_GLAZE_PROPORTION',
-    'LIGHTING_DESCRIPTION',
-    'LOW_ENERGY_LIGHTING',
-    'NUMBER_OPEN_FIREPLACES',
-    'MAINHEATCONT_DESCRIPTION',
-    'EXTENSION_COUNT',
-    # 'GLAZED_AREA',  # May not need this since we have MULTI_GLAZE_PROPORTION
-]
-
-# For these fields, we take an average if we have multiple values
-AVERAGE_FIXED_FEATURES = [
-    "TOTAL_FLOOR_AREA",
-    "FLOOR_HEIGHT"
-]
-
-# For these fields, we take the latest value if we have multiple values
-# Since more recent EPCs have been conducted with more rigour, we assume that the latest value is
-# the most accurate
-LATEST_FIELD = [
-    "NUMBER_HABITABLE_ROOMS",
-    "NUMBER_HEATED_ROOMS",
-    "FIXED_LIGHTING_OUTLETS_COUNT",
-    "FLOOR_LEVEL",
-    "CONSTRUCTION_AGE_BAND",  # This is a field we're probably want to use verisk data for
-]
-
-# If we see thee features changing, we don't use the EPC, since deem it not to be reliable
-MANDATORY_FIXED_FEATURES = [
-    "PROPERTY_TYPE",
-    "BUILT_FORM",
-    "CONSTITUENCY"
-]
-
-# For particularly old EPC data, we have inconsistent records so we'll only include EPCS that were
-# conducted after 2010, since SAP09 was introduced in 2009 an later SAP12 was introduced in England
-# and Wales from 31 July 2014
-EARLIEST_EPC_DATE = "2014-08-01"
-
-RDSAP_RESPONSE = "CURRENT_ENERGY_EFFICIENCY"
-HEAT_DEMAND_RESPONSE = "ENERGY_CONSUMPTION_CURRENT"
-
-
-def iterative_filtering(cleaning_averages, property_data):
-
-
-    # Define the columns to filter on
-    columns_to_filter = ["PROPERTY_TYPE", "BUILT_FORM", "CONSTRUCTION_AGE_BAND", "NUMBER_HABITABLE_ROOMS",
-                         "NUMBER_HEATED_ROOMS"]
-    
-    # Merge datasets together on columns
-    filtered_data = pd.merge(cleaning_averages, property_data.iloc[[-1]], on=columns_to_filter)
-
-    # # Start with the entire cleaning_averages DataFrame
-    # filtered_data = cleaning_averages.copy()
-
-    # # Iterate through the columns and apply filters one by one
-    # for column in columns_to_filter:
-    #     # Apply the filter using the value from property_data
-    #     new_filtered_data = filtered_data[filtered_data[column] == property_data[column].iloc[0]]
-
-    #     # If the filter results in no data, return the previous result
-    #     if new_filtered_data.empty:
-    #         continue
-
-    #     # If the filter is successful, update the filtered data
-    #     filtered_data = new_filtered_data
-
-    return filtered_data
-
-
-def ordinal(n):
-    if 10 <= n % 100 <= 20:
-        suffix = 'th'
-    else:
-        suffix = {1: 'st', 2: 'nd', 3: 'rd'}.get(n % 10, 'th')
-
-    return str(n) + suffix
-
-
-FLOOR_LEVEL_MAP = {
-    "Basement": -1,
-    "Ground": 0,
-    "ground floor": 0,
-    "20+": 20,
-    "21st or above": 21,
-    **{str(i).zfill(2): i for i in range(0, 21)},
-    **{ordinal(i): i for i in range(-1, 21)},
-    **{str(i): i for i in range(-1, 21)},
-    **{i: i for i in range(-1, 21)},
-}
-
-BUILT_FORM_REMAP = {
-    "Enclosed End-Terrace": "End-Terrace",
-    "Enclosed Mid-Terrace": "Mid-Terrace",
-}
-
-DATA_PROCESSOR_SETTINGS = {
-    'low_memory': False,
-    'epc_minimum_count': 1,
-    'column_mappings': {'UPRN': [int, str]}
-}
-
-class DataProcessor:
-    """
-    Handle data loading and data preprocessing
-    """
-
-    def __init__(self, filepath: Path) -> None:
-        self.filepath = filepath
-
-    def load_data(self, low_memory=False) -> None:
-        self.data = pd.read_csv(self.filepath, low_memory=low_memory)
-
-    def process(self) -> pd.DataFrame:
-        """
-        Load all data adnd process data via composition
-        """
-        self.load_data(low_memory=DATA_PROCESSOR_SETTINGS['low_memory'])
-        self.confine_data()
-        self.recast_df_columns(column_mappings=DATA_PROCESSOR_SETTINGS['column_mappings'])
-        self.clean_multi_glaze_proportion()
-        self.retain_multiple_epc_properties(epc_minimum_count=DATA_PROCESSOR_SETTINGS['epc_minimum_count'])
-        
-        self.data = self.data.sort_values(["UPRN", "LODGEMENT_DATE"], ascending=True)
-
-        return self.data
-    
-    def make_cleaning_averages(self) -> Tuple[pd.DataFrame, pd.DataFrame]:
-        # Define a custom function to calculate the median, excluding missing values
-        def median_without_missing(group):
-            return group[AVERAGE_FIXED_FEATURES].median(skipna=True)
-
-        cleaning_averages = self.data.groupby(
-            ["PROPERTY_TYPE", "BUILT_FORM", "CONSTRUCTION_AGE_BAND", "NUMBER_HABITABLE_ROOMS", "NUMBER_HEATED_ROOMS"],
-            observed=True
-        ).apply(median_without_missing).reset_index()
-
-        general_averages = self.data.groupby(["PROPERTY_TYPE", "BUILT_FORM"], observed=True).apply(
-            median_without_missing).reset_index()
-
-        return cleaning_averages, general_averages
-
-    def retain_multiple_epc_properties(self, epc_minimum_count: int = 1) -> None:
-        '''
-        Reduce the data futher by keeping only datasets with multiple epcs
-        '''
-
-        counts = self.data.groupby("UPRN").size().reset_index()
-        counts.columns = ["UPRN", "count"]
-
-        # take UPRNS with multiple EPCs
-        counts = counts[counts["count"] > epc_minimum_count]
-        self.data = pd.merge(self.data, counts, on='UPRN')
-
-    
-    def recast_df_columns(self, column_mappings: dict) -> None:
-        """
-        Recast columns from the dataframe to ensure the behaviour we want
-        """
-
-        for key, values in column_mappings.items():
-            if key not in self.data.columns:
-                print('Column mapping incorrectly specified')
-                exit(1)
-            for value in values:
-                self.data[key] = self.data[key].astype(value)
-
-
-    def confine_data(self) -> None:
-        """
-        Include all step to reduce down the data based on assumptions
-        """
-
-        # Filter 1: UPRN is a unique identifier for a property, so we remove any EPCs that don't have one
-
-        # Filter 2: Lodgement date is the date the EPC was lodged, so we remove any EPCs that were lodged
-        # before the introduction of SAP09
-
-        # Filter 3: We remove EPCS that were conducted for a new build, since these are performed with
-        # full SAP, which produces different results to the RdSAP methodology
-
-        # Filter 4: We remove floor level in top floor or mid floor since this is ambiguous
-
-        self.data = self.data[~pd.isnull(self.data["UPRN"])]
-        self.data = self.data[self.data["LODGEMENT_DATE"] >= EARLIEST_EPC_DATE]
-        self.data = self.data[self.data["TRANSACTION_TYPE"] != "new dwelling"]
-        self.data = self.data[~self.data["FLOOR_LEVEL"].isin(["top floor", "mid floor"])]
-
-    
-    def clean_multi_glaze_proportion(self) -> None:
-        """
-        If there is no multi-glaze proportion but the windows are fully glazed, then we should assume a score of 100
-        """
-
-        no_multi_glaze_proportion_index = pd.isnull(self.data["MULTI_GLAZE_PROPORTION"]) & (self.data["WINDOWS_DESCRIPTION"].isin(FULLY_GLAZED_DESCRIPTIONS))
-        self.data.loc[no_multi_glaze_proportion_index, 'MULTI_GLAZE_PROPORTION'] = 100
-
-
-
 def app():
    # Get all the files in the directory

    # Data glossary:
    # https://epc.opendatacommunities.org/docs/guidance#glossary

-    directories = list_subdirectories(DATA_DIRECTORY)
+    # List all subdirectories
+    directories = [entry for entry in DATA_DIRECTORY.iterdir() if entry.is_dir()]

    dataset = []
+
+
    for directory in tqdm(directories):

        filepath = directory / "certificates.csv"

        data_processor = DataProcessor(filepath=filepath)

-        df = data_processor.process()
-        cleaning_averages, general_averages = data_processor.make_cleaning_averages()
+        df = data_processor.pre_process()
+        cleaning_averages = data_processor.make_cleaning_averages()

        for uprn, property_data in df.groupby("UPRN", observed=True):

@ -280,44 +53,51 @@ def app():
            # If a property has changed building type, we can ignore the epc rating i.e. this should be 1 unique row
            if max(modified_property_data[MANDATORY_FIXED_FEATURES].nunique()) > 1:
                continue
-
-            mandatory_field_data = modified_property_data[MANDATORY_FIXED_FEATURES].iloc[-1].to_dict()
            
            # Remap certain columns
            modified_property_data['FLOOR_LEVEL'] = modified_property_data['FLOOR_LEVEL'].replace(FLOOR_LEVEL_MAP)
            modified_property_data['BUILT_FROM'] = modified_property_data['BUILT_FORM'].replace(BUILT_FORM_REMAP)

+            # Take the latest row for both the LATEST_FEILDS and MANDATORY FIELDS 
            latest_field_data = modified_property_data[LATEST_FIELD].iloc[-1].to_dict()
+            mandatory_field_data = modified_property_data[MANDATORY_FIXED_FEATURES].iloc[-1].to_dict()

            # Taking just the last row, which is the percentage change from the latest to previous one only
            # modified_property_data[AVERAGE_FIXED_FEATURES].fillna(value=0).pct_change().iloc[-1] > 0.1

+            # We can replace any NA values for Average fixed features
+            # We have columns that we want to merge on, but some of these columns are all NA values
+            # So we determine which columns to merge on, and get the equivalent grouping in the averages
+            columns_to_merge_on = ["PROPERTY_TYPE", "BUILT_FORM", "CONSTRUCTION_AGE_BAND", "NUMBER_HABITABLE_ROOMS",
+                         "NUMBER_HEATED_ROOMS"]
+            
+            if any(modified_property_data[columns_to_merge_on].isna()):
+                # If there are any NA value, back fill first (i.e most recent), then forward fill if needed
+                modified_property_data[columns_to_merge_on] = modified_property_data[columns_to_merge_on].fillna(method='bfill').fillna(method='ffill')
+            
+            # Extract the columns that are non all None
+            na_columns = modified_property_data[columns_to_merge_on].isna().all()
+            columns_to_merge_on = na_columns.index[~na_columns].to_list()

+            #  Get the corresponding groupby and merge, and fill in NA values
+            cleaning_averages_to_merge = cleaning_averages.groupby(columns_to_merge_on)[['TOTAL_FLOOR_AREA', 'FLOOR_HEIGHT']].mean()
+            modified_property_data = pd.merge(modified_property_data, cleaning_averages_to_merge, on=columns_to_merge_on, suffixes=['', '_AVERAGE'])
+            modified_property_data['TOTAL_FLOOR_AREA'] = modified_property_data['TOTAL_FLOOR_AREA'].fillna(modified_property_data['TOTAL_FLOOR_AREA_AVERAGE'])
+            modified_property_data['FLOOR_HEIGHT'] = modified_property_data['FLOOR_HEIGHT'].fillna(modified_property_data['FLOOR_HEIGHT_AVERAGE'])
+            modified_property_data = modified_property_data.drop(columns=['TOTAL_FLOOR_AREA_AVERAGE', 'FLOOR_HEIGHT_AVERAGE'])

            for field in AVERAGE_FIXED_FEATURES:
                vals =  list(modified_property_data[field].dropna().unique())
                if len(vals) > 1:
                    # Check the values are too far apart
+                    # TODO: we could have multiple values here, why only use the first two?
                    if abs(vals[0] - vals[1]) / vals[0] > 0.1:
                        # Take the more recent value since it's likely to be more accurate
                        vals = [vals[-1]]

                if vals:
                    field_value = np.mean(vals)
-                else:
-                    # Clean using averages
-
-                    avgs = iterative_filtering(cleaning_averages, modified_property_data)
-                    # TODO: Should probably do a mean/median?
-                    field_value = avgs[field].iloc[0]
-
-                    if pd.isnull(field_value):
-                        # Just the use the general averages
-                        field_value = general_averages[
-                            (general_averages["PROPERTY_TYPE"] == modified_property_data["PROPERTY_TYPE"].iloc[0]) &
-                            (general_averages["BUILT_FORM"] == modified_property_data["BUILT_FORM"].iloc[0])
-                            ][field].iloc[0]
-                        
+                
                fixed_data[field] = field_value

            #Combine all fields together
@ -369,6 +149,9 @@ def app():

            dataset.extend(property_model_data)

+    output = pd.DataFrame(dataset)
+    output.to_parquet('./dataset.parquet')
+

 if __name__ == "__main__":
    app()
--- a/model_data/simulation_system/settings.py
+++ b/model_data/simulation_system/settings.py
@ -0,0 +1,114 @@
+# Using a simple Python file as settings for now
+# TODO: migrate to dynaconf
+
+TOTAL_FLOOR_AREA_NATIONAL_AVERAGE = 70  # last-resort fill for TOTAL_FLOOR_AREA; NOTE(review): presumably m2 - confirm
+FLOOR_HEIGHT_NATIONAL_AVERAGE = 2.45  # last-resort fill for FLOOR_HEIGHT; NOTE(review): presumably metres - confirm
+
+FULLY_GLAZED_DESCRIPTIONS = [  # WINDOWS_DESCRIPTION values treated as fully multi-glazed
+    "Fully double glazed",
+    "High performance glazing",
+    "Fully triple glazed",
+    "Full secondary glazing",
+    "Multiple glazing throughout",
+]
+
+FIXED_FEATURES = [  # NOTE(review): not imported by the other files in this change - confirm it is still needed
+    'PROPERTY_TYPE',
+    'BUILT_FORM',
+    'CONSTRUCTION_AGE_BAND',
+    'NUMBER_HABITABLE_ROOMS',
+    'CONSTITUENCY',
+    'NUMBER_HEATED_ROOMS',
+    'FIXED_LIGHTING_OUTLETS_COUNT',
+    'FLOOR_HEIGHT',
+    'FLOOR_LEVEL',
+    'TOTAL_FLOOR_AREA',
+]
+
+# Columns treated as component features; each column must be listed only once
+COMPONENT_FEATURES = [
+    'TRANSACTION_TYPE',
+    'WALLS_DESCRIPTION',
+    'FLOOR_DESCRIPTION',
+    'LIGHTING_DESCRIPTION',
+    'ROOF_DESCRIPTION',
+    'MAINHEAT_DESCRIPTION',
+    'HOTWATER_DESCRIPTION',
+    'MAIN_FUEL',
+    'MECHANICAL_VENTILATION',
+    'SECONDHEAT_DESCRIPTION',
+    'ENERGY_TARIFF',  # Not sure if this is relevant
+    'SOLAR_WATER_HEATING_FLAG',
+    'PHOTO_SUPPLY',
+    'WINDOWS_DESCRIPTION',
+    'GLAZED_TYPE',
+    'MULTI_GLAZE_PROPORTION',
+    'LOW_ENERGY_LIGHTING',
+    'NUMBER_OPEN_FIREPLACES',
+    'MAINHEATCONT_DESCRIPTION',
+    'EXTENSION_COUNT',
+    # 'GLAZED_AREA',  # May not need this since we have MULTI_GLAZE_PROPORTION
+]
+
+# For these fields, we take an average if we have multiple values
+AVERAGE_FIXED_FEATURES = [
+    "TOTAL_FLOOR_AREA",
+    "FLOOR_HEIGHT"
+]
+
+# For these fields, we take the latest value if we have multiple values
+# Since more recent EPCs have been conducted with more rigour, we assume that the latest value is
+# the most accurate
+LATEST_FIELD = [
+    "NUMBER_HABITABLE_ROOMS",
+    "NUMBER_HEATED_ROOMS",
+    "FIXED_LIGHTING_OUTLETS_COUNT",
+    "FLOOR_LEVEL",
+    "CONSTRUCTION_AGE_BAND",  # This is a field we probably want to use verisk data for
+]
+
+# If we see these features changing, we don't use the EPC, since we deem it not to be reliable
+MANDATORY_FIXED_FEATURES = [
+    "PROPERTY_TYPE",
+    "BUILT_FORM",
+    "CONSTITUENCY"
+]
+
+# For particularly old EPC data, we have inconsistent records so we'll only include EPCs lodged on or
+# after this date. SAP09 was introduced in 2009 and later SAP12 was introduced in England and Wales
+# from 31 July 2014
+EARLIEST_EPC_DATE = "2014-08-01"
+
+RDSAP_RESPONSE = "CURRENT_ENERGY_EFFICIENCY"
+HEAT_DEMAND_RESPONSE = "ENERGY_CONSUMPTION_CURRENT"
+
+def ordinal(n):
+    """Return ``n`` as a string followed by its English ordinal suffix, e.g. 1 -> '1st'"""
+    # 10..20 (mod 100) always take 'th' (11th, 12th, 13th); otherwise the last digit decides
+    last_two, last_one = n % 100, n % 10
+    always_th = 10 <= last_two <= 20
+    suffix = 'th' if always_th else {1: 'st', 2: 'nd', 3: 'rd'}.get(last_one, 'th')
+    return f'{n}{suffix}'
+
+FLOOR_LEVEL_MAP = {  # FLOOR_LEVEL value -> integer floor number
+    "Basement": -1,
+    "Ground": 0,
+    "ground floor": 0,
+    "20+": 20,
+    "21st or above": 21,
+    **{str(i).zfill(2): i for i in range(0, 21)},  # zero-padded strings "00" .. "20"
+    **{ordinal(i): i for i in range(-1, 21)},  # ordinal strings such as "1st", "2nd", "20th"
+    **{str(i): i for i in range(-1, 21)},  # plain strings "-1" .. "20"
+    **{i: i for i in range(-1, 21)},  # values that are already integers map to themselves
+}
+
+BUILT_FORM_REMAP = {  # enclosed terrace variants are folded into the plain terrace categories
+    "Enclosed End-Terrace": "End-Terrace",
+    "Enclosed Mid-Terrace": "Mid-Terrace",
+}
+
+DATA_PROCESSOR_SETTINGS = {
+    'low_memory': False,  # forwarded to pandas.read_csv
+    'epc_minimum_count': 1,  # only UPRNs with more than this many EPC rows are kept
+    'column_mappings': {'UPRN': [int, str]}  # dtypes applied in order: int first, then str
+}
--- a/model_data/simulation_system/training.py
+++ b/model_data/simulation_system/training.py
@ -0,0 +1,20 @@
+import os
+from logging import Logger, getLogger
+
+logger = getLogger(__name__)  # getLogger() attaches the logger to the logging hierarchy; Logger(...) does not
+
+def training():
+    """
+    Pipeline to run training on the dataset (currently it only logs each planned stage)
+    """
+
+    logger.info('Loading data')
+
+    logger.info('Feature selection')
+
+    logger.info('Build Model')
+
+    logger.info('Evaluate matrics')
+
+if __name__ == "__main__":
+    training()