Merge pull request #237 from Hestia-Homes/spatial-data

Spatial data
2026-06-30 13:10:47 +00:00 · 2023-10-11 12:32:37 +08:00 · 2023-10-11 12:32:37 +08:00 · b2142a7f8e
commit b2142a7f8e
parent 642a224a7b 5d0d0825b0
154 changed files with 1977 additions and 13742 deletions
--- a/.coveragerc
+++ b/.coveragerc
@ -2,12 +2,8 @@
 omit =
    *__init__*
    */tests/*
    model_data/temp_inputs.py
    model_data/config.py
    model_data/__init__.py
    model_data/app.py
    model_data/plotting/*
    recommendations/rdsap_tables.py
-    model_data/simulation_system/*
+    */config.py
-    model_data/cleaner_app.py
+    */app.py
    */settings.py
    backend/app/*
--- a/.github/workflows/deploy_sap_model_lambda.yml
+++ b/.github/workflows/deploy_sap_model_lambda.yml
@ -1,81 +0,0 @@
 name: Sap Model Deploy
 on:
  push:
    branches: [ dev, prod ]
 jobs:
  deploy:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v3
      - name: Set up Python
        uses: actions/setup-python@v2
        with:
          python-version: 3.10.12
      - name: Install Serverless and plugins
        run: |
          npm install -g serverless
          npm install -g serverless-domain-manager
      - name: AWS credentials for dev
        if: github.ref == 'refs/heads/dev'
        uses: aws-actions/configure-aws-credentials@v1
        with:
          aws-access-key-id: ${{ secrets.DEV_AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }}
          aws-region: eu-west-2
      - name: AWS credentials for prod
        if: github.ref == 'refs/heads/prod'
        uses: aws-actions/configure-aws-credentials@v1
        with:
          aws-access-key-id: ${{ secrets.PROD_AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.PROD_AWS_SECRET_ACCESS_KEY }}
          aws-region: eu-west-2
      - name: Set domain name
        id: set_domain
        run: echo "::set-output name=domain::${{ secrets[format('{0}_DOMAIN_NAME', github.ref_name)] }}"
      - name: Set ECR credentials
        id: set_ecr_credentials
        run: |
          echo "::set-output name=ecr_uri::${{ secrets[format('{0}_SAP_MODEL_ECR_URI', github.ref_name)] }}"
      - name: Setup Docker
        uses: docker/setup-buildx-action@v1
      - name: Login to ECR
        run: |
          aws ecr get-login-password --region eu-west-2 | docker login --username AWS --password-stdin ${{ steps.set_ecr_credentials.outputs.ecr_uri }}
      # Building and pushing Docker image with caching
      - name: Build and push Docker image
        uses: docker/build-push-action@v3
        with:
          context: ./model_data/simulation_system
          file: ./model_data/simulation_system/Dockerfiles/Dockerfile.prediction.lambda
          push: true
          tags: ${{ steps.set_ecr_credentials.outputs.ecr_uri }}:${{ github.sha }}
          cache-from: type=gha
          cache-to: type=gha,mode=max
          platform: linux/amd64
          provenance: false
      - name: Deploy to AWS Lambda via Serverless
        env:
          RUNTIME_ENVIRONMENT: ${{ github.ref_name }}
          MODEL_DIRECTORY_BUCKET: 'retrofit-model-directory-${{ github.ref_name }}'
          PREDICTIONS_BUCKET: 'retrofit-sap-predictions-${{ github.ref_name }}'
          DATA_BUCKET: 'retrofit-data-${{ github.ref_name }}'
          DOMAIN_NAME: ${{ steps.set_domain.outputs.domain }}
          ECR_URI: ${{ steps.set_ecr_credentials.outputs.ecr_uri }}
          GITHUB_SHA: ${{ github.sha }}
        run: |
          # Deploy to AWS Lambda via Serverless
          sls deploy --config sapmodel.serverless.yml --stage ${{ github.ref_name }} --verbose
--- a/.gitignore
+++ b/.gitignore
@ -239,7 +239,8 @@ fabric.properties
 .idea/caches/build_file_checksums.ser
 # Locally stored data
-/model_data/local_data/*
+local_data/*
 /local_data/*
 *.DS_Store
 infrastructure/terraform/.terraform*
@ -261,3 +262,6 @@ model_data/simulation_system/predictions/
 .idea/Model.iml
 .idea/misc.iml
 adhoc
 adhoc/*
--- a/.idea/Model.iml
+++ b/.idea/Model.iml
@ -7,7 +7,14 @@
      <sourceFolder url="file://$MODULE_DIR$/open_uprn" isTestSource="false" />
      <sourceFolder url="file://$MODULE_DIR$/recommendations" isTestSource="false" />
    </content>
-    <orderEntry type="jdk" jdkName="Python 3.10 (model_data)" jdkType="Python SDK" />
+    <orderEntry type="jdk" jdkName="Python 3.10 (backend)" jdkType="Python SDK" />
    <orderEntry type="sourceFolder" forTests="false" />
  </component>
  <component name="PyNamespacePackagesService">
    <option name="namespacePackageFolders">
      <list>
        <option value="$MODULE_DIR$/local_data" />
      </list>
    </option>
  </component>
 </module>
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
@ -1,6 +1,6 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <project version="4">
-  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (model_data)" project-jdk-type="Python SDK" />
+  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (backend)" project-jdk-type="Python SDK" />
  <component name="PythonCompatibilityInspectionAdvertiser">
    <option name="version" value="3" />
  </component>
--- a/model_data/BaseUtility.py
+++ b/model_data/BaseUtility.py
@ -43,7 +43,9 @@ class Definitions:
        # contained within the first of these multiple entries is being provided. As there are no restrictions on the 
        # value in this first field it means that sometimes the first field in a multiple entry description field may 
        # contain a ‘null’ value. A resolution to correct these anomalies will be considered for future data releases.
-        "NULL"
+        "NULL",
        # We sometimes see fields populated with just an empty string.
        ""
    }
    DATA_ANOMALY_SUBSTRINGS = {
--- a/backend/Property.py
+++ b/backend/Property.py
@ -1,9 +1,22 @@
 from datetime import datetime
 import re
 import os
 import pandas as pd
 from etl.epc.DataProcessor import DataProcessor
 from etl.epc_clean.epc_attributes.all_cleaners import all_cleaner_map
 from utils.logger import setup_logger
 from utils.s3 import read_dataframe_from_s3_parquet
 from epc_api.client import EpcClient
-from model_data.config import EPC_AUTH_TOKEN
+from BaseUtility import Definitions
 from model_data.BaseUtility import Definitions
 from recommendations.rdsap_tables import england_wales_age_band_lookup
 from recommendations.recommendation_utils import estimate_floors, estimate_perimeter, get_wall_type, estimate_wall_area
 ENVIRONMENT = os.environ.get('ENVIRONMENT', 'dev')
 EPC_AUTH_TOKEN = os.environ.get('EPC_AUTH_TOKEN')
 DATA_BUCKET = os.environ.get('DATA_BUCKET', 'retrofit-data-dev' if ENVIRONMENT == 'dev' else None)
 logger = setup_logger()
 class Property(Definitions):
@ -30,17 +43,27 @@ class Property(Definitions):
    lighting = None
    coordinates = None
    age_band = None
    def __init__(self, id, postcode, address1, epc_client=None, data=None):
        self.id = id
        self.postcode = postcode
        self.address1 = address1
        self.data = data
        self.old_data = None
        self.property_dimensions = None
        self.uprn = None
        self.full_sap_epc = None
-        self.in_conservation_area = None
+        self.in_conservation_area, self.is_listed, self.is_heritage = None, None, None
        self.restricted_measures = False
        self.year_built = None
        self.number_of_rooms = None
        self.age_band = None
        self.construction_age_band = None
        self.number_of_floors = None
        self.perimeter = None
        self.wall_type = None
        self.floor_type = None
        self.energy = None
        self.ventilation = None
@ -83,9 +106,14 @@ class Property(Definitions):
            ]
            if len(newest_response) > 1:
                raise Exception("More than one result found for this address - investigate me")
            # We'll keep old EPCs in case it contains information, not present on the newest one
            self.old_data = [epc for epc in response["rows"] if epc["lmk-key"] != newest_response[0]["lmk-key"]]
            response["rows"] = newest_response
        self.data = response["rows"][0]
        self.uprn = int(self.data["uprn"])
    def set_coordinates(self, coordinates):
        """
@ -127,7 +155,7 @@ class Property(Definitions):
        """
        ventilation = self.data["mechanical-ventilation"]
-        # perform some simple cleaning - when checking 300k properties, the only unique values were
+        # perform some simple cleaning - when checking 300k epc, the only unique values were
        # {'', 'mechanical, supply and extract', 'NO DATA!', 'natural', 'mechanical, extract only'}
        if ventilation in self.DATA_ANOMALY_MATCHES or ventilation in [""]:
            ventilation = None
@ -145,7 +173,7 @@ class Property(Definitions):
        - solar_pv
        This is based on the "photo-supply" field in the EPC data.
-        When checking 100k properties, either the value was "" or a stringified number
+        When checking 100k epc, either the value was "" or a stringified number
        """
        solar_pv = self.data["photo-supply"]
@ -244,11 +272,10 @@ class Property(Definitions):
        self.set_count_variables()
        self.set_heat_loss_corridor()
        self.set_mains_gas()
        self.set_floor_height()
        self.set_wall_area()
        self.set_floor_area()
        self.set_age_band()
        self.set_basic_property_dimensions()
        for description, attribute in cleaned.items():
            if self.data[description] in self.DATA_ANOMALY_MATCHES:
@ -262,10 +289,19 @@ class Property(Definitions):
            attributes = [
                x for x in cleaned[description] if x["original_description"] == self.data[description]
            ]
-            if len(attributes) != 1:
+            if len(attributes) > 1:
                raise ValueError("Either No attributes or multiple found for %s" % description)
            if len(attributes) == 0:
                # We attempt to perform the clean on the fly
                cleaner_cls = all_cleaner_map[description]
                attributes = [cleaner_cls(self.data[description]).process()]
            setattr(self, self.ATTRIBUTE_MAP[description], attributes[0])
        self.set_wall_type()
        self.set_floor_type()
    def set_age_band(self):
        """
        Sets a cleaned version of the age band of the property given the EPC data
@ -275,14 +311,20 @@ class Property(Definitions):
        if not self.data:
            raise ValueError("Property does not contain data")
-        self.age_band = england_wales_age_band_lookup[self.data["construction-age-band"]]
+        self.construction_age_band = DataProcessor.clean_construction_age_band(self.data["construction-age-band"])
        self.age_band = england_wales_age_band_lookup.get(self.construction_age_band)
-    def set_is_in_conservation_area(self, in_conservation_area):
+    def set_spatial(self, spatial: pd.DataFrame):
        """
        Sets whether the property is in a conservation area given the output of the ConservationAreaClient
-        :param in_conservation_area:  string value, indicating whether the property is in a conservation area
+        :param spatial:  Dataframe, containing the spatial data for the property
        """
-        self.in_conservation_area = in_conservation_area
+        self.in_conservation_area = spatial["conservation_status"].values[0]
        self.is_listed = spatial["is_listed_building"].values[0]
        self.is_heritage = spatial["is_heritage_building"].values[0]
        if self.in_conservation_area is True | self.is_listed is True | self.is_heritage is True:
            self.restricted_measures = True
    def set_year_built(self):
        """
@ -349,17 +391,6 @@ class Property(Definitions):
        else:
            self.mains_gas = map[self.data["mains-gas-flag"]]
    def set_floor_height(self):
        """
        Sets the floor height of the property
        :return:
        """
        if self.data["floor-height"] == "" or self.data["floor-height"] in self.DATA_ANOMALY_MATCHES:
            self.floor_height = None
        else:
            self.floor_height = float(self.data["floor-height"])
    def _clean_upload_data(self, to_update):
        for k, v in to_update.items():
            if v in self.DATA_ANOMALY_MATCHES:
@ -443,21 +474,210 @@ class Property(Definitions):
        return property_details_epc
-    def set_wall_area(self):
+    def get_spatial_data(self, uprn_filenames):
        """
        This method is placeholder
        It implements our floor area model to produce an estimate of the property's insulatable wall area
        """
        import random
        self.insulation_wall_area = random.uniform(60, 100)
    def set_floor_area(self):
        """
        Sets the floor area based on the EPC data
        """
-        # We don't know the number of floors at the moment so we're going to assume 1
+        Given a property's UPRN, this method will pull the associated spatial data from s3
-        # however this is something we'll need to use Verisk data for
+        :return:
        """
        if self.uprn is None:
            raise ValueError("URPN is not set, run search_address_epc")
        # We get the file name for the uprn
        filtered_df = uprn_filenames[(uprn_filenames['lower'] <= self.uprn) & (uprn_filenames['upper'] >= self.uprn)]
        if filtered_df.empty:
            logger.warning("Could not find file containing UPRNS")
            return None
        filename = filtered_df.iloc[0]['filenames']
        spatial_data = read_dataframe_from_s3_parquet(
            bucket_name=DATA_BUCKET, file_key=f"spatial/{filename}"
        )
        spatial = spatial_data[spatial_data["UPRN"] == self.uprn]
        # Pull out spatial features
        self.set_spatial(spatial)
    def _filter_property_dimensions(self, property_dimensions):
        """
        Will filter the property dimensions dataframe to only include the relevant rows for the property
        :param property_dimensions:
        :return: filtered property dimensions dataframe
        """
        result = property_dimensions[(property_dimensions["PROPERTY_TYPE"] == self.data["property-type"])]
        if self.construction_age_band is not None and self.construction_age_band not in self.DATA_ANOMALY_MATCHES:
            result = result[(result["CONSTRUCTION_AGE_BAND"] == self.construction_age_band)]
        if self.data["built-form"] not in self.DATA_ANOMALY_MATCHES and self.data["built-form"] in result["BUILT_FORM"]:
            result = result[(result["BUILT_FORM"] == self.data["built-form"])]
        return result[["NUMBER_HABITABLE_ROOMS", "TOTAL_FLOOR_AREA", "FLOOR_HEIGHT"]].mean()
    def set_basic_property_dimensions(self):
        """
        Derives the basic dimensions of the property from its EPC record

        Sets the floor area, number of rooms, number of floors, floor height, perimeter and insulatable wall area.
        When the EPC record has no habitable room count, or no usable floor height, the filtered property
        dimensions for the local authority are read from s3 (only if not already held on the property) and used as
        backup values
        :return:
        """
        epc = self.data
        self.floor_area = float(epc["total-floor-area"])

        def floor_height_unusable():
            # An empty string and a known anomaly marker both mean that no floor height was recorded
            return epc["floor-height"] == "" or epc["floor-height"] in self.DATA_ANOMALY_MATCHES

        needs_backup = not epc["number-habitable-rooms"] or floor_height_unusable()
        if needs_backup and self.property_dimensions is None:
            local_authority_dimensions = read_dataframe_from_s3_parquet(
                bucket_name=DATA_BUCKET, file_key=f"property_dimensions/{self.data['local-authority']}.parquet"
            )
            self.property_dimensions = self._filter_property_dimensions(local_authority_dimensions)

        if epc["number-habitable-rooms"]:
            self.number_of_rooms = float(epc["number-habitable-rooms"])
        else:
            self.number_of_rooms = float(self.property_dimensions["NUMBER_HABITABLE_ROOMS"].round())

        property_type = epc["property-type"]
        if property_type == "Flat":
            self.number_of_floors = 1
        elif property_type == "House":
            self.number_of_floors = estimate_floors(self.floor_area, self.number_of_rooms)
        else:
            raise NotImplementedError("Implement me")

        if floor_height_unusable():
            self.floor_height = float(self.property_dimensions["FLOOR_HEIGHT"].round(2))
        else:
            self.floor_height = float(epc["floor-height"])

        # Perimeter is estimated per floor, so both inputs are divided by the number of floors
        floors = self.number_of_floors
        self.perimeter = estimate_perimeter(self.floor_area / floors, self.number_of_rooms / floors)
        self.insulation_wall_area = estimate_wall_area(
            num_floors=floors, floor_height=self.floor_height, perimeter=self.perimeter
        )
    def set_wall_type(self):
        """
        Sets the wall type of the property by passing the cleaned wall attributes to get_wall_type as keyword
        arguments
        :return:
        """
        wall_attributes = self.walls
        self.wall_type = get_wall_type(**wall_attributes)
    def set_floor_type(self):
        """
        This method sets the floor type of the property, which is used for calculating u-values
        :return:
        """
        self.floor_type = "suspended" if self.floor["is_suspended"] else "solid"
    @staticmethod
    def _extract_component(component_data, component_rename_cols, component_drop_cols, rename_prefix=None):
        for k in component_rename_cols:
            component_data[f"{rename_prefix}_{k}"] = component_data[k]
        component_data = {
            k: v for k, v in component_data.items() if k not in component_drop_cols + component_rename_cols
        }
        return component_data
    def get_model_data(self):
        """
        Builds the flat dictionary of cleaned data for the property, which is scored by our machine learning models

        The cleaned component attributes are reduced with _extract_component and combined with a selection of raw
        EPC fields and the dimensions held on the property. When the built form is a data anomaly, it is replaced
        with a fixed value chosen by property type.

        For future iterations of this, we probably want to implement a singular method in DataProcessor, which can
        be used in the etl code and in here
        :return: dictionary of model data to be scored in the model
        :raises NotImplementedError: when the built form is an anomaly and the property type has no fallback
        """
        description_cols = ["original_description", "clean_description"]
        insulated_rename = ["thermal_transmittance", "insulation_thickness"]
        insulated_drop = ["thermal_transmittance_unit", "is_assumed", "is_valid"] + description_cols

        extract = self._extract_component
        walls = extract(self.walls, insulated_rename, insulated_drop, "walls")
        roof = extract(self.roof, insulated_rename, insulated_drop, "roof")
        floor = extract(self.floor, insulated_rename, insulated_drop, "floor")
        windows = extract(self.windows, [], description_cols + ["no_data"])
        fuel = extract(self.main_fuel, ["tariff_type"], description_cols + ["tariff_type"], "main-fuel")
        main_heating = extract(self.main_heating, [], description_cols + ["has_assumed"])
        main_heating_controls = extract(self.main_heating_controls, [], description_cols)
        hotwater = extract(self.hotwater, ["tariff_type"], description_cols + ['assumed'], "hotwater")

        # We'll need to clean second heating
        second_heating = self.data["secondheat-description"]

        # Raw EPC fields are keyed in the record by the lower-case, hyphenated form of the model column name
        epc_raw_columns = (
            'TRANSACTION_TYPE',
            'ENERGY_TARIFF',
            'PROPERTY_TYPE',
            'UPRN',
            'NUMBER_OPEN_FIREPLACES',
            'FIXED_LIGHTING_OUTLETS_COUNT',
            'MULTI_GLAZE_PROPORTION',
            'MECHANICAL_VENTILATION',
            'PHOTO_SUPPLY',
            'LOW_ENERGY_LIGHTING',
            'SOLAR_WATER_HEATING_FLAG',
            'GLAZED_TYPE',
            'CONSTITUENCY',
            'NUMBER_HEATED_ROOMS',
            'EXTENSION_COUNT',
        )
        epc_raw_data = {}
        for column in epc_raw_columns:
            epc_raw_data[column] = self.data[column.lower().replace("_", "-")]

        built_form = self.data["built-form"]
        if built_form in self.DATA_ANOMALY_MATCHES:
            # TODO: If built form isn't captured, we use the most common value for that property type - we shall
            #       improve this methodology
            fallback_by_property_type = {
                "Flat": "Mid-Terrace",
                "House": "Semi-Detached",
                "Bungalow": "Detached",
                "Maisonette": "Mid-Terrace"
            }
            built_form = fallback_by_property_type.get(self.data["property-type"])
            if not built_form:
                raise NotImplementedError("Not handled this property type when cleaning built form")

        # Merged in this order so that later sources win on a key clash
        property_data = {}
        for component in (walls, roof, floor, fuel, main_heating, main_heating_controls, hotwater, windows):
            property_data.update(component)
        property_data.update({
            "SECONDHEAT_DESCRIPTION": second_heating,
            "DAYS_TO": DataProcessor.calculate_days_to(self.data["lodgement-date"]),
            "SAP": float(self.data["current-energy-efficiency"]),
            "CARBON": float(self.data["co2-emissions-current"]),
            "HEAT_DEMAND": float(self.data["energy-consumption-current"]),
            "estimated_perimeter": self.perimeter,
            "CONSTRUCTION_AGE_BAND": self.construction_age_band,
            "FLOOR_HEIGHT": self.floor_height,
            "NUMBER_HABITABLE_ROOMS": self.number_of_rooms,
            "TOTAL_FLOOR_AREA": self.floor_area,
        })
        property_data.update(epc_raw_data)
        property_data["BUILT_FORM"] = built_form
        return property_data
--- a/backend/app/db/functions/materials_functions.py
+++ b/backend/app/db/functions/materials_functions.py
@ -1,10 +1,17 @@
 from backend.app.db.models.materials import Material
 from functools import lru_cache
@lru_cache(maxsize=128)
 def get_materials(session):
    """
    This function will retrieve all materials from the database.
    :return: A list of Material objects if successful, an empty list otherwise.
    TODO: It might not be the best choice to store the materials data in a database table since this
          table probably won't be very large and won't be updated that often. It might be better to
          store this data in s3 and load it into memory when the app starts up. We will test this
    """
    materials = session.query(Material).filter(Material.is_active).all()
--- a/backend/app/db/models/materials.py
+++ b/backend/app/db/models/materials.py
@ -12,6 +12,7 @@ class MaterialType(enum.Enum):
    solid_floor_insulation = "solid_floor_insulation"
    external_wall_insulation = "external_wall_insulation"
    internal_wall_insulation = "internal_wall_insulation"
    cavity_wall_insulation = "cavity_wall_insulation"
 class DepthUnit(enum.Enum):
--- a/backend/app/plan/router.py
+++ b/backend/app/plan/router.py
@ -1,50 +1,41 @@
 from collections import defaultdict
 from fastapi import APIRouter, Depends
 from backend.app.db.models.portfolio import rating_lookup
 from backend.app.dependencies import validate_token
 from backend.app.plan.schemas import PlanTriggerRequest
 from backend.app.utils import read_csv_from_s3
 from backend.app.config import get_settings
 from backend.Property import Property
 from epc_api.client import EpcClient
 from utils.logger import setup_logger
 from utils.s3 import read_from_s3
 from recommendations.FloorRecommendations import FloorRecommendations
 from recommendations.WallRecommendations import WallRecommendations
 from recommendations.config import UPGRADES_MAP
 from utils.uvalue_estimates import classify_decile_newvalues
 from backend.app.db.utils import row2dict
 from starlette.responses import Response
 from sqlalchemy.orm import sessionmaker
 from sqlalchemy.exc import IntegrityError, OperationalError
 from datetime import datetime
 import pandas as pd
-import msgpack
+from epc_api.client import EpcClient
 from fastapi import APIRouter, Depends
 from sqlalchemy.exc import IntegrityError, OperationalError
 from sqlalchemy.orm import sessionmaker
 from starlette.responses import Response
-# model apis
+from backend.app.config import get_settings
-from backend.ml_models.sap_change_model.api import SAPChangeModelAPI
+from backend.app.db.connection import db_engine
 # database interaction functions
 from backend.app.db.functions.property_functions import (
    create_property, create_property_targets, update_property_data, create_property_details_epc
 )
 from backend.app.db.functions.materials_functions import get_materials
 from backend.app.db.functions.portfolio_functions import aggregate_portfolio_recommendations
 from backend.app.db.functions.property_functions import (
    create_property, create_property_details_epc, create_property_targets, update_property_data
 )
 from backend.app.db.functions.recommendations_functions import (
    create_plan, create_plan_recommendations, upload_recommendations
 )
-from backend.app.db.functions.portfolio_functions import aggregate_portfolio_recommendations
+from backend.app.db.models.portfolio import rating_lookup
-from backend.app.db.connection import db_engine
+from backend.app.dependencies import validate_token
 from backend.app.plan.schemas import PlanTriggerRequest
 from backend.app.plan.utils import (
    create_recommendation_scoring_data, filter_materials, get_cleaned, insert_temp_recommendation_id
 )
 from backend.app.utils import epc_to_sap_lower_bound, read_csv_from_s3, read_parquet_from_s3
-from model_data.optimiser.GainOptimiser import GainOptimiser
+from backend.ml_models.sap_change_model.api import SAPChangeModelAPI
-from model_data.optimiser.CostOptimiser import CostOptimiser
+from backend.Property import Property
-from backend.app.utils import epc_to_sap_lower_bound, read_parquet_from_s3
+from etl.epc.DataProcessor import DataProcessor
-from model_data.optimiser.optimiser_functions import prepare_input_measures
+from etl.epc.settings import COLUMNS_TO_MERGE_ON
-from model_data.simulation_system.core.DataProcessor import DataProcessor
+from recommendations.FloorRecommendations import FloorRecommendations
-from model_data.simulation_system.core.Settings import COLUMNS_TO_MERGE_ON
+from recommendations.optimiser.CostOptimiser import CostOptimiser
-
+from recommendations.optimiser.GainOptimiser import GainOptimiser
-# TODO: This is placeholder until data is stored in DB
+from recommendations.optimiser.optimiser_functions import prepare_input_measures
-from backend.app.plan.uvalue_estimates_walls import uvalue_estimates_walls
+from recommendations.WallRecommendations import WallRecommendations
-from backend.app.plan.uvalue_estimates_floors import uvalue_estimates_floors
+from utils.logger import setup_logger
 from utils.s3 import read_dataframe_from_s3_parquet
 logger = setup_logger()
@ -55,147 +46,25 @@ router = APIRouter(
    responses={404: {"description": "Not found"}}
 )
 # TODO: Load this data from db
 # Hard-coded coordinate records, one dictionary per UPRN. trigger_plan picks the record whose 'UPRN' equals
 # the property's uprn and passes it to set_coordinates.
 # NOTE(review): X/Y_COORDINATE look like projected grid coordinates alongside the lat/long pair - confirm
 open_uprn_data = [
    {'UPRN': 6032920, 'X_COORDINATE': 535110.0, 'Y_COORDINATE': 181819.0, 'LATITUDE': 51.5191407,
     'LONGITUDE': -0.0540506},
    {'UPRN': 6038625, 'X_COORDINATE': 535374.0, 'Y_COORDINATE': 182784.0, 'LATITUDE': 51.5277492,
     'LONGITUDE': -0.0498772},
    {'UPRN': 34153991, 'X_COORDINATE': 523238.74, 'Y_COORDINATE': 178003.02, 'LATITUDE': 51.4875579,
     'LONGITUDE': -0.226392},
    {'UPRN': 10008299676, 'X_COORDINATE': 533285.0, 'Y_COORDINATE': 184711.0, 'LATITUDE': 51.5455629,
     'LONGITUDE': -0.0792445},
    {'UPRN': 10008299677, 'X_COORDINATE': 533285.0, 'Y_COORDINATE': 184711.0, 'LATITUDE': 51.5455629,
     'LONGITUDE': -0.0792445},
    {'UPRN': 100021039066, 'X_COORDINATE': 535506.0, 'Y_COORDINATE': 185624.0, 'LATITUDE': 51.5532385,
     'LONGITUDE': -0.0468833},
    {'UPRN': 100021226060, 'X_COORDINATE': 529247.0, 'Y_COORDINATE': 187959.0, 'LATITUDE': 51.5756908,
     'LONGITUDE': -0.1362513},
    {'UPRN': 200003489276, 'X_COORDINATE': 533210.0, 'Y_COORDINATE': 179442.0, 'LATITUDE': 51.4982309,
     'LONGITUDE': -0.0823165}
 ]
 # Hard-coded conservation area status per uprn, covering the same UPRNs as open_uprn_data. The status is one
 # of 'in_conservation_area', 'not_in_conservation_area' or 'unknown'; trigger_plan reads it by matching 'uprn'.
 in_conservation_area_data = [
    {'uprn': 6032920, 'is_in_conservation_area': 'not_in_conservation_area'},
    {'uprn': 6038625, 'is_in_conservation_area': 'not_in_conservation_area'},
    {'uprn': 34153991, 'is_in_conservation_area': 'unknown'},
    {'uprn': 10008299676, 'is_in_conservation_area': 'in_conservation_area'},
    {'uprn': 10008299677, 'is_in_conservation_area': 'in_conservation_area'},
    {'uprn': 100021039066, 'is_in_conservation_area': 'not_in_conservation_area'},
    {'uprn': 100021226060, 'is_in_conservation_area': 'in_conservation_area'},
    {'uprn': 200003489276, 'is_in_conservation_area': 'in_conservation_area'}
 ]
 # TODO: db
 # Hard-coded decile tables for floors and walls. Each holds ten labels and eleven ascending boundaries, so
 # consecutive boundaries delimit one labelled decile.
 # NOTE(review): the quantity and unit of the boundaries are not stated here - confirm before reuse
 floors_decile_data = {
    'decile_labels': ['Decile 1', 'Decile 2', 'Decile 3', 'Decile 4', 'Decile 5', 'Decile 6', 'Decile 7', 'Decile 8',
                      'Decile 9', 'Decile 10'], 'decile_boundaries': [6., 50., 56., 69., 77.6, 87., 98., 112.,
                                                                      127., 150., 2279.]}
 walls_decile_data = {
    'decile_labels': ['Decile 1', 'Decile 2', 'Decile 3', 'Decile 4', 'Decile 5', 'Decile 6', 'Decile 7', 'Decile 8',
                      'Decile 9', 'Decile 10'], 'decile_boundaries': [6., 49., 51., 55., 64., 71., 76., 83., 96.,
                                                                      120., 2279.]}
 def filter_materials(materials):
    """
    Groups materials by their type
    :param materials: iterable of material rows; each one is converted to a dictionary with row2dict
    :return: plain dictionary mapping each material "type" to the list of material dictionaries of that type
    """
    grouped = {}
    for row in materials:
        record = row2dict(row)
        grouped.setdefault(record["type"], []).append(record)
    return grouped
 def insert_temp_recommendation_id(property_recommendations):
    """
    Numbers every recommendation with a temporary "recommendation_id", which is needed to tell recommendations
    apart once the optimiser has been run. Ids start at 0 and keep counting across the groups.
    :param property_recommendations:  nested list of recommendations, grouped by data_types
    :return: the same nested list, where each recommendation has an integer "recommendation_id" inserted
    """
    flattened = (rec for recs in property_recommendations for rec in recs)
    for temp_id, rec in enumerate(flattened):
        rec["recommendation_id"] = temp_id
    return property_recommendations
 def get_cleaned():
    """
    Retrieves the cleaned dataset from s3, which holds the cleaned descriptions for the epc dataset
    The file is read from the retrofit data bucket of the current environment and unpacked with msgpack
    :return: the unpacked cleaned dataset
    """
    bucket = "retrofit-data-{environment}".format(environment=get_settings().ENVIRONMENT)
    packed = read_from_s3(s3_file_name="cleaned_epc_data/cleaned.bson", bucket_name=bucket)
    return msgpack.unpackb(packed, raw=False)
 def create_recommendation_scoring_data(
    property: Property,
    recommendation: dict,
    starting_epc_data: pd.DataFrame,
    ending_epc_data: pd.DataFrame,
    fixed_data: pd.DataFrame,
 ):
    """
    Prepares the record for a single recommendation that is passed to the sap model api

    The record identifies the property and the recommendation, then takes the first row of each of the three
    dataframes, with later dataframes winning on a column clash. Finally the ending description of the upgraded
    component is replaced with its insulated equivalent from UPGRADES_MAP.
    :param property: property the recommendation applies to
    :param recommendation: recommendation dictionary, holding a "recommendation_id" and a "type"
    :param starting_epc_data: dataframe whose first row is merged into the record
    :param ending_epc_data: dataframe whose first row is merged into the record
    :param fixed_data: dataframe whose first row is merged into the record
    :return: dictionary of scoring data
    :raises NotImplementedError: when the recommendation type is neither wall nor floor insulation
    """
    scoring_dict = {
        "UPRN": property.data["uprn"],
        "id": "+".join([str(property.id), str(recommendation["recommendation_id"])]),
        "LOCAL_AUTHORITY": property.data["local-authority"],
    }
    for frame in (starting_epc_data, ending_epc_data, fixed_data):
        scoring_dict.update(frame.to_dict("records")[0])

    # We update the description to indicate it's insulated
    recommendation_type = recommendation["type"]
    if recommendation_type == "wall_insulation":
        scoring_dict["WALLS_DESCRIPTION_ENDING"] = UPGRADES_MAP[property.walls["clean_description"]]
        return scoring_dict
    if recommendation_type == "floor_insulation":
        scoring_dict["FLOOR_DESCRIPTION_ENDING"] = UPGRADES_MAP[property.floor["clean_description"]]
        return scoring_dict
    raise NotImplementedError("Implement me")
@router.post("/trigger")
 async def trigger_plan(body: PlanTriggerRequest):
    logger.info("Connecting to db")
    session = sessionmaker(bind=db_engine)()
-    created_at = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
+    created_at = datetime.now().isoformat()
    try:
        session.begin()
        logger.info("Getting the inputs")
        # Read in the trigger file from s3
        bucket_name = get_settings().PLAN_TRIGGER_BUCKET
        epc_client = EpcClient(auth_token=get_settings().EPC_AUTH_TOKEN)
        plan_input = read_csv_from_s3(bucket_name=get_settings().PLAN_TRIGGER_BUCKET, filepath=body.trigger_file_path)
        uprn_filenames = read_dataframe_from_s3_parquet(
            bucket_name=get_settings().DATA_BUCKET, file_key="spatial/filename_meta.parquet"
        )
        cleaning_data = read_parquet_from_s3(
            bucket_name=get_settings().DATA_BUCKET, file_key="sap_change_model/cleaning_dataset.parquet",
        )
        plan_input = read_csv_from_s3(bucket_name=bucket_name, filepath=body.trigger_file_path)
        input_properties = []
        for config in plan_input:
            # We validate each record in the file. If the record is NOT valid, we need to handle this accordingly
@ -228,32 +97,21 @@ async def trigger_plan(body: PlanTriggerRequest):
        if not input_properties:
            return Response(status_code=204)
-        logger.info("Getting EPC, coordinates and conservation area data")
+        logger.info("Getting EPC, and spatial data")
        for p in input_properties:
            p.search_address_epc()
            p.set_year_built()
-
+            p.get_spatial_data(uprn_filenames)
            coordinate_data = [x for x in open_uprn_data if x['UPRN'] == int(p.data['uprn'])][0]
            p.set_coordinates(coordinate_data)
            in_conservation_area = [x for x in in_conservation_area_data if x['uprn'] == int(p.data['uprn'])][0].get(
                "is_in_conservation_area"
            )
            p.set_is_in_conservation_area(in_conservation_area)
        # The materials data could be cached or local so we don't need to make
        # consistent requests to the backend for
        # the same data
        # TODO: It might not be the best choice to store the materials data in a database table since this
        #       table probably won't be very large and won't be updated that often. It might be better to
        #       store this data in s3 load it into memory when the app starts up. We will test this
        logger.info("Reading in materials and cleaned datasets")
        materials = get_materials(session)
        materials_by_type = filter_materials(materials)
        cleaned = get_cleaned()
-        logger.info("Getting components and properties recommendations")
+        logger.info("Getting components and epc recommendations")
        # TODO: Move this to a class. We probably want a Recommender class which takes the optimisers
        #      in as a dependency and then the optimisers can take the input measures in as part of the setup() method
@ -263,34 +121,13 @@ async def trigger_plan(body: PlanTriggerRequest):
        for p in input_properties:
            property_recommendations = []
            # For each property, classify the floor area decile
            total_floor_area_group_decile = classify_decile_newvalues(
                decile_boundaries=floors_decile_data["decile_boundaries"],
                decile_labels=floors_decile_data["decile_labels"],
                new_values=[float(p.data["total-floor-area"])],
            )[0]
            # Property recommendations
            p.get_components(cleaned)
            # This is placeholder, until the full dataset is loaded into the database and we just make a read to the
            # database
            floors_u_value_estimate = [
                x for x in uvalue_estimates_floors
                if (x['local-authority'] == p.data["local-authority"]) &
                   (x['property-type'] == p.data["property-type"]) &
                   (x['built-form'] == p.data["built-form"]) &
                   (x['floor-energy-eff'] == p.data["floor-energy-eff"] if p.data[
                                                                               "floor-energy-eff"] != 'N/A' else True) &
                   (x['floor-env-eff'] == p.data["floor-env-eff"] if p.data["floor-env-eff"] != 'N/A' else True)
            ]
            # Floor recommendations
            floor_recommender = FloorRecommendations(
                property_instance=p,
-                uvalue_estimates=floors_u_value_estimate,
+                materials=materials_by_type["floor"],
                total_floor_area_group_decile=total_floor_area_group_decile,
                materials=materials_by_type["suspended_floor_insulation"] + materials_by_type["solid_floor_insulation"],
            )
            floor_recommender.recommend()
@ -298,30 +135,10 @@ async def trigger_plan(body: PlanTriggerRequest):
                property_recommendations.append(floor_recommender.recommendations)
            # Wall recommendations
            # We would make this u-value query directly to the database
            total_floor_area_group_decile = classify_decile_newvalues(
                decile_boundaries=walls_decile_data["decile_boundaries"],
                decile_labels=walls_decile_data["decile_labels"],
                new_values=[float(p.data["total-floor-area"])],
            )[0]
            # This is placeholder, until the full dataset is loaded into the database and we just make a read to the
            # database
            walls_u_value_estimate = [
                x for x in uvalue_estimates_walls
                if (x['local-authority'] == p.data["local-authority"]) &
                   (x['property-type'] == p.data["property-type"]) &
                   (x['built-form'] == p.data["built-form"]) &
                   (x['walls-energy-eff'] == p.data["walls-energy-eff"] if p.data[
                                                                               "walls-energy-eff"] != 'N/A' else True) &
                   (x['walls-env-eff'] == p.data["walls-env-eff"] if p.data["walls-env-eff"] != 'N/A' else True)
            ]
            wall_recomender = WallRecommendations(
                property_instance=p,
-                uvalue_estimates=walls_u_value_estimate,
+                materials=materials_by_type["walls"]
                total_floor_area_group_decile=total_floor_area_group_decile,
                materials=materials_by_type["external_wall_insulation"] + materials_by_type["internal_wall_insulation"]
            )
            wall_recomender.recommend()
@ -337,12 +154,8 @@ async def trigger_plan(body: PlanTriggerRequest):
            recommendations[p.id] = property_recommendations
            # Finally, we'll prepare data for predicting the impact on SAP
            # TODO: We should use the cleaned data from get_components in the data rather than the raw
            #       values. We should create a method in Property which takes the EPC data and inserts the cleaned
            #       data
            data_processor = DataProcessor(None, newdata=True)
-            data_processor.insert_data(pd.DataFrame([p.data.copy()]))
+            data_processor.insert_data(pd.DataFrame([p.get_model_data()]))
            data_processor.pre_process()
            starting_epc_data = data_processor.get_component_features(suffix="_STARTING")
@ -350,10 +163,10 @@ async def trigger_plan(body: PlanTriggerRequest):
            fixed_data = data_processor.get_fixed_features()
            # We update the ending record with the recommended updates and we set lodgement date to today
-            ending_epc_data["LODGEMENT_DATE_ENDING"] = created_at
+            ending_epc_data["DAYS_TO_ENDING"] = data_processor.calculate_days_to(created_at)
            for recommendations_by_type in property_recommendations:
-                for rec in recommendations_by_type:
+                for i, rec in enumerate(recommendations_by_type):
                    scoring_dict = create_recommendation_scoring_data(
                        property=p,
                        recommendation=rec,
@ -370,15 +183,6 @@ async def trigger_plan(body: PlanTriggerRequest):
        logger.info("Preparing data for scoring in sap change api")
        recommendations_scoring_data = pd.DataFrame(recommendations_scoring_data)
        # Clean the data
        logger.info("Reading in cleaning dataset from s3")
        cleaning_data = read_parquet_from_s3(
            bucket_name=get_settings().DATA_BUCKET,
            file_key="sap_change_model/cleaning_dataset.parquet",
        ).rename(columns={"local-authority": "LOCAL_AUTHORITY"})
        # Merge the cleaning data onto recommendations_scoring_data
        # Perform the same cleaning as in the model
        recommendations_scoring_data = DataProcessor.apply_averages_cleaning(
            data_to_clean=recommendations_scoring_data,
@ -386,6 +190,13 @@ async def trigger_plan(body: PlanTriggerRequest):
            cols_to_merge_on=COLUMNS_TO_MERGE_ON + ["LOCAL_AUTHORITY"]
        ).drop(columns=["LOCAL_AUTHORITY"])
        recommendations_scoring_data = DataProcessor.clean_missings_after_description_process(
            recommendations_scoring_data, [
                c for c in recommendations_scoring_data.columns if
                ("thermal_transmittance" in c) or ("insulation_thickness" in c)
            ]
        )
        sap_change_model_api = SAPChangeModelAPI(portfolio_id=body.portfolio_id, timestamp=created_at)
        file_location = sap_change_model_api.upload_scoring_data(
            df=recommendations_scoring_data, bucket=get_settings().DATA_BUCKET
@ -396,14 +207,17 @@ async def trigger_plan(body: PlanTriggerRequest):
        # Retrieve the predictions
        predictions = pd.DataFrame(
-            read_csv_from_s3(bucket_name=get_settings().PREDICTIONS_BUCKET, filepath=response["storage_filepath"])
+            read_parquet_from_s3(
                bucket_name=get_settings().PREDICTIONS_BUCKET,
                file_key=response["storage_filepath"].split(get_settings().PREDICTIONS_BUCKET + "/")[1]
            )
        )
-        predictions["RDSAP_CHANGE"] = predictions["RDSAP_CHANGE"].astype(float).round(1)
+        predictions["predictions"] = predictions["predictions"].astype(float).round(1)
        predictions[['property_id', 'recommendation_id']] = predictions['id'].str.split('+', expand=True)
        # Insert the predictions into the recommendations and run the optimiser
-        logger.info("Storing recommendations")
+        logger.info("Optimising recommendations")
        for property_id in recommendations.keys():
            property = [p for p in input_properties if p.id == property_id][0]
@ -411,9 +225,11 @@ async def trigger_plan(body: PlanTriggerRequest):
            for recommendations_by_type in recommendations[property_id]:
                for rec in recommendations_by_type:
-                    rec["sap_points"] = property_predictions[property_predictions["recommendation_id"] == str(
+                    new_sap = property_predictions[property_predictions["recommendation_id"] == str(
                        rec["recommendation_id"]
-                    )]["RDSAP_CHANGE"].values[0]
+                    )]["predictions"].values[0]
                    rec["sap_points"] = new_sap - float(property.data["current-energy-efficiency"])
                    if rec["sap_points"] is None:
                        raise ValueError("Sap points missing")
@ -451,8 +267,6 @@ async def trigger_plan(body: PlanTriggerRequest):
            final_recommendations = [
                rec for recommendations_by_type in final_recommendations for rec in recommendations_by_type
            ]
            # We update recommendations[property_id]
            recommendations[property_id] = final_recommendations
        # 1) the property data
--- a/backend/app/plan/temp_script_for_flight.py
+++ b/backend/app/plan/temp_script_for_flight.py
@ -0,0 +1,176 @@
 from datetime import datetime
 import pandas as pd
 from epc_api.client import EpcClient
 from fastapi import APIRouter, Depends
 from sqlalchemy.exc import IntegrityError, OperationalError
 from sqlalchemy.orm import sessionmaker
 from starlette.responses import Response
 from backend.app.config import get_settings
 from backend.app.db.connection import db_engine
 from backend.app.db.functions.materials_functions import get_materials
 from backend.app.db.functions.portfolio_functions import aggregate_portfolio_recommendations
 from backend.app.db.functions.property_functions import (
    create_property, create_property_details_epc, create_property_targets, update_property_data
 )
 from backend.app.db.functions.recommendations_functions import (
    create_plan, create_plan_recommendations, upload_recommendations
 )
 from backend.app.db.models.portfolio import rating_lookup
 from backend.app.dependencies import validate_token
 from backend.app.plan.schemas import PlanTriggerRequest
 from backend.app.plan.utils import (
    create_recommendation_scoring_data, filter_materials, get_cleaned, insert_temp_recommendation_id
 )
 from backend.app.utils import epc_to_sap_lower_bound, read_csv_from_s3, read_parquet_from_s3
 from backend.ml_models.sap_change_model.api import SAPChangeModelAPI
 from backend.Property import Property
 from etl.epc.DataProcessor import DataProcessor
 from etl.epc.settings import COLUMNS_TO_MERGE_ON
 from recommendations.FloorRecommendations import FloorRecommendations
 from recommendations.optimiser.CostOptimiser import CostOptimiser
 from recommendations.optimiser.GainOptimiser import GainOptimiser
 from recommendations.optimiser.optimiser_functions import prepare_input_measures
 from recommendations.WallRecommendations import WallRecommendations
 from utils.logger import setup_logger
 from utils.s3 import read_dataframe_from_s3_parquet
 # Offline setup for the recommendation flow: every input is read from pickles in the
 # current working directory rather than from s3, the database or the EPC api
 logger = setup_logger()
 import pickle
 # NOTE: pickle.load can execute arbitrary code while unpickling - only ever run this against
 # pickle files that were produced locally and are trusted
 with open('local_data.pickle', 'rb') as f:
    local_data = pickle.load(f)
 # property_dimensions and sap_change_dataset are loaded here but are not referenced again in
 # the statements below
 with open("property_dimensions.pickle", "rb") as f:
    property_dimensions = pickle.load(f)
 with open("sap_change_dataset.pickle", "rb") as f:
    sap_change_dataset = pickle.load(f)
 # Timestamp later written onto the ending EPC record as its lodgement date
 created_at = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
 # Unpack the cached inputs from the local_data pickle
 plan_input = local_data["plan_input"]
 uprn_filenames = local_data["uprn_filenames"]
 local_property_data = local_data["local_property_data"]
 materials = local_data["materials"]
 materials_by_type = filter_materials(materials)
 cleaned = local_data["cleaned"]
 cleaning_data = local_data["cleaning_data"]
 # Need to find some proper materials
 # Until then, two example cavity wall insulation materials are appended to the wall materials
 materials_by_type["walls"] += [
    {'id': 4, 'type': 'cavity_wall_insulation', 'description': 'Example Material 1',
     'depths': None,
     'depth_unit': None, 'cost': 20,
     'cost_unit': 'gbp_sq_meter', 'r_value_per_mm': 0.0278, 'r_value_unit': 'square_meter_kelvin_per_watt',
     'thermal_conductivity': 0.036, 'thermal_conductivity_unit': 'watt_per_meter_kelvin',
     'link': None, 'created_at': None, 'is_active': True},
    {'id': 10, 'type': "cavity_wall_insulation", 'description': 'Example Material 2',
     'depths': None, 'depth_unit': None, 'cost': 25, 'cost_unit': 'gbp_sq_meter',
     'r_value_per_mm': 0.02631579, 'r_value_unit': 'square_meter_kelvin_per_watt', 'thermal_conductivity': 0.038,
     'thermal_conductivity_unit': 'watt_per_meter_kelvin',
     'link': None,
     'created_at': None, 'is_active': True}
 ]
 # The client is built with a placeholder token; the EPC data used below comes from
 # local_property_data instead
 epc_client = EpcClient(auth_token="NO-TOKEN")
 # Build one Property per plan input row, pairing it by position with the cached property record
 input_properties = []
 for i, config in enumerate(plan_input):
    property_id = local_property_data[i]["id"]
    input_properties.append(
        Property(
            postcode=config['postcode'],
            address1=config['address'],
            epc_client=epc_client,
            id=property_id
        )
    )
 logger.info("Getting EPC, and spatial data")
 # Populate each Property from the cached record (again matched by position)
 for i, p in enumerate(input_properties):
    p.data = local_property_data[i]["data"]
    p.uprn = local_property_data[i]["uprn"]
    p.id = local_property_data[i]["id"]
    p.full_sap_epc = local_property_data[i]["full_sap_epc"]
    p.old_data = local_property_data[i]["old_data"]
    # The listed / conservation area / heritage flags are hard-coded to False here
    p.is_listed = False
    p.in_conservation_area = False
    p.is_heritage = False
    p.set_year_built()
    # TODO: TESTING
    # Overrides the number of habitable rooms with a fixed value for every property
    p.data['number-habitable-rooms'] = 3
 # Recommendations per property id, and the flat records that will be scored by the sap change model
 recommendations = {}
 recommendations_scoring_data = []
 # Bound before the loop so the cleanup at the end cannot hit an unbound name when
 # input_properties is empty or every property is skipped by the `continue` below
 data_processor = None
 for p in input_properties:
    property_recommendations = []
    # Property recommendations
    p.get_components(cleaned)
    # Floor recommendations
    floor_recommender = FloorRecommendations(
        property_instance=p,
        materials=materials_by_type["floor"],
    )
    floor_recommender.recommend()
    if floor_recommender.recommendations:
        property_recommendations.append(floor_recommender.recommendations)
    # Wall recommendations
    wall_recomender = WallRecommendations(
        property_instance=p,
        materials=materials_by_type["walls"]
    )
    wall_recomender.recommend()
    if wall_recomender.recommendations:
        property_recommendations.append(wall_recomender.recommendations)
    # We insert temporary ids into the recommendations which is important for the optimiser later
    property_recommendations = insert_temp_recommendation_id(property_recommendations)
    # Nothing to score for this property, so it is left out of `recommendations` altogether
    if not property_recommendations:
        continue
    recommendations[p.id] = property_recommendations
    # Finally, we'll prepare data for predicting the impact on SAP
    # TODO: We should use the cleaned data from get_components in the data rather than the raw
    #       values. We should create a method in Property which takes the EPC data and inserts the cleaned
    #       data
    data_processor = DataProcessor(None, newdata=True)
    # A copy is inserted so the processor does not receive the property's own data dict
    data_processor.insert_data(pd.DataFrame([p.data.copy()]))
    data_processor.pre_process()
    starting_epc_data = data_processor.get_component_features(suffix="_STARTING")
    ending_epc_data = data_processor.get_component_features(suffix="_ENDING")
    fixed_data = data_processor.get_fixed_features()
    # We update the ending record with the recommended updates and we set lodgement date to today
    ending_epc_data["LODGEMENT_DATE_ENDING"] = created_at
    # One scoring record is produced per individual recommendation
    for recommendations_by_type in property_recommendations:
        for rec in recommendations_by_type:
            scoring_dict = create_recommendation_scoring_data(
                property=p,
                recommendation=rec,
                starting_epc_data=starting_epc_data,
                ending_epc_data=ending_epc_data,
                fixed_data=fixed_data,
            )
            recommendations_scoring_data.append(scoring_dict)
 # cleanup
 del data_processor
--- a/backend/app/plan/utils.py
+++ b/backend/app/plan/utils.py
@ -0,0 +1,187 @@
 import pandas as pd
 from backend.Property import Property
 from collections import defaultdict
 from utils.s3 import read_from_s3
 from recommendations.config import UPGRADES_MAP
 from recommendations.recommendation_utils import get_wall_u_value, get_floor_u_value, get_roof_u_value
 from backend.app.db.utils import row2dict
 from backend.app.config import get_settings
 import msgpack
 def filter_materials(materials):
    """
    Groups materials by the building component they can be applied to
    :param materials: iterable of material rows, each one is converted with row2dict
    :return: plain dict with a "walls" and a "floor" key, each holding the list of material
             dicts whose "type" belongs to that component
    """
    component_types = {
        "walls": ["internal_wall_insulation", "external_wall_insulation", "cavity_wall_insulation"],
        "floor": ["suspended_floor_insulation", "solid_floor_insulation"]
    }
    # Convert every row once up front, then pick out the matching ones per component
    material_dicts = [row2dict(row) for row in materials]
    return {
        component: [entry for entry in material_dicts if entry["type"] in accepted_types]
        for component, accepted_types in component_types.items()
    }
 def insert_temp_recommendation_id(property_recommendations):
    """
    Stamps every recommendation with a temporary, sequential "recommendation_id"
    The id is needed to filter the recommendations once the optimiser has been run
    :param property_recommendations: nested list of recommendation dicts, grouped by type
    :return: the same nested list, where each recommendation dict has been given an integer
             "recommendation_id"; ids start at 0 and keep counting across the groups
    """
    # Walk the groups in order as one flat stream so the numbering carries over between groups
    flattened = (rec for group in property_recommendations for rec in group)
    for temp_id, rec in enumerate(flattened):
        rec["recommendation_id"] = temp_id
    return property_recommendations
 def get_cleaned():
    """
    Retrieves the dataset of cleaned epc descriptions from s3
    The stored bytes are in MessagePack format (the key ends in .bson regardless), so they are
    decoded before being returned
    :return: the unpacked contents of cleaned_epc_data/cleaned.bson
    """
    # One data bucket exists per environment
    bucket = "retrofit-data-{environment}".format(environment=get_settings().ENVIRONMENT)
    raw_bytes = read_from_s3(s3_file_name="cleaned_epc_data/cleaned.bson", bucket_name=bucket)
    return msgpack.unpackb(raw_bytes, raw=False)
 def _current_wall_u_value(prop):
    """
    Estimates the wall u-value from the property's existing wall data
    :param prop: property whose walls and age_band are passed to get_wall_u_value
    :return: the result of get_wall_u_value for the current walls
    """
    return get_wall_u_value(
        clean_description=prop.walls["clean_description"],
        age_band=prop.age_band,
        is_granite_or_whinstone=prop.walls["is_granite_or_whinstone"],
        is_sandstone_or_limestone=prop.walls["is_sandstone_or_limestone"],
    )
 def _current_floor_u_value(prop):
    """
    Estimates the floor u-value from the property's existing floor data
    :param prop: property whose floor, dimensions and age_band are passed to get_floor_u_value
    :return: the result of get_floor_u_value for the current floor
    """
    return get_floor_u_value(
        floor_type=prop.floor_type,
        area=prop.floor_area,
        perimeter=prop.perimeter,
        wall_type=prop.wall_type,
        insulation_thickness=prop.floor["insulation_thickness"],
        age_band=prop.age_band,
    )
 def _current_roof_u_value(prop):
    """
    Estimates the roof u-value from the property's existing roof data
    :param prop: property whose roof and age_band are passed to get_roof_u_value
    :return: the result of get_roof_u_value for the current roof
    """
    return get_roof_u_value(
        insulation_thickness=prop.roof["insulation_thickness"],
        has_dwelling_above=prop.roof["has_dwelling_above"],
        is_loft=prop.roof["is_loft"],
        is_roof_room=prop.roof["is_roof_room"],
        is_thatched=prop.roof["is_thatched"],
        age_band=prop.age_band,
        is_flat=prop.roof["is_flat"],
        is_pitched=prop.roof["is_pitched"],
        is_at_rafters=prop.roof["is_at_rafters"],
    )
 def create_recommendation_scoring_data(
    property: Property,
    recommendation: dict,
    starting_epc_data: pd.DataFrame,
    ending_epc_data: pd.DataFrame,
    fixed_data: pd.DataFrame,
 ):
    """
    This wrapper function prepares data to be passed to the sap model api
    The first record of each dataframe is flattened into one dict, missing starting and ending
    u-values are estimated from the property, missing insulation thicknesses are set to "none" and
    the ending values of the upgraded component are replaced with the recommendation's values
    :param property: property the recommendation belongs to
    :param recommendation: recommendation dict; "type" and "recommendation_id" are always read,
                           "new_u_value" (and "parts" for floor insulation) for the upgraded component
    :param starting_epc_data: dataframe whose first row holds the starting component features
    :param ending_epc_data: dataframe whose first row holds the "_ENDING" component features
    :param fixed_data: dataframe whose first row holds the fixed features
    :return: dict ready to be scored by the sap model api
    :raises NotImplementedError: if the recommendation type is not wall or floor insulation, or a
                                 floor insulation recommendation has more than one part
    """
    # Fail fast on unsupported types rather than after the u-value work has been done
    recommendation_type = recommendation["type"]
    if recommendation_type not in ["wall_insulation", "floor_insulation"]:
        raise NotImplementedError("Implement me")
    scoring_dict = {
        "UPRN": property.data["uprn"],
        "id": "+".join([str(property.id), str(recommendation["recommendation_id"])]),
        "LOCAL_AUTHORITY": property.data["local-authority"],
        **starting_epc_data.to_dict("records")[0],
        **ending_epc_data.to_dict("records")[0],
        **fixed_data.to_dict("records")[0]
    }
    # Set starting u-values if we don't have them
    # NOTE(review): this is a truthiness check, so 0 counts as missing while NaN (which is truthy)
    #               does not - confirm how missing u-values arrive from the data processor
    if not scoring_dict["walls_thermal_transmittance"]:
        scoring_dict["walls_thermal_transmittance"] = _current_wall_u_value(property)
    if not scoring_dict["floor_thermal_transmittance"]:
        scoring_dict["floor_thermal_transmittance"] = _current_floor_u_value(property)
    if not scoring_dict["roof_thermal_transmittance"]:
        scoring_dict["roof_thermal_transmittance"] = _current_roof_u_value(property)
    for col in [
        "walls_insulation_thickness", "floor_insulation_thickness", "roof_insulation_thickness"
    ]:
        if scoring_dict[col] is None:
            scoring_dict[col] = "none"
    # Walls: either apply the upgrade or carry the current state over to the ending record
    if recommendation_type == "wall_insulation":
        # The upgrade made here is to the u-value of the walls and the description of the
        # insulation thickness
        scoring_dict["walls_thermal_transmittance_ENDING"] = recommendation["new_u_value"]
        scoring_dict["walls_insulation_thickness_ENDING"] = "above average"
    else:
        if not scoring_dict["walls_thermal_transmittance_ENDING"]:
            scoring_dict["walls_thermal_transmittance_ENDING"] = _current_wall_u_value(property)
        if scoring_dict["walls_insulation_thickness_ENDING"] is None:
            scoring_dict["walls_insulation_thickness_ENDING"] = "none"
    # Floor: either apply the upgrade or carry the current state over to the ending record
    if recommendation_type == "floor_insulation":
        if len(recommendation["parts"]) > 1:
            raise NotImplementedError("Have more than 1 floor insulation part - handle this case")
        scoring_dict["floor_thermal_transmittance_ENDING"] = recommendation["new_u_value"]
        # We don't really see above average for this in the training data
        scoring_dict["floor_insulation_thickness_ENDING"] = "average"
    else:
        if not scoring_dict["floor_thermal_transmittance_ENDING"]:
            scoring_dict["floor_thermal_transmittance_ENDING"] = _current_floor_u_value(property)
        if scoring_dict["floor_insulation_thickness_ENDING"] is None:
            scoring_dict["floor_insulation_thickness_ENDING"] = "none"
    # Roof: no roof recommendations exist yet, so the ending record always carries the current state
    if not scoring_dict["roof_thermal_transmittance_ENDING"]:
        scoring_dict["roof_thermal_transmittance_ENDING"] = _current_roof_u_value(property)
    # The thickness default applies whether or not the u-value had to be estimated, which matches
    # how the walls and floor are handled
    if scoring_dict["roof_insulation_thickness_ENDING"] is None:
        scoring_dict["roof_insulation_thickness_ENDING"] = "none"
    return scoring_dict
--- a/backend/app/plan/uvalue_estimates_floors.py
+++ b/backend/app/plan/uvalue_estimates_floors.py
--- a/backend/app/plan/uvalue_estimates_walls.py
+++ b/backend/app/plan/uvalue_estimates_walls.py
--- a/backend/app/utils.py
+++ b/backend/app/utils.py
@ -79,17 +79,17 @@ def sap_to_epc(sap_points: int):
    if sap_points <= 0 or sap_points > 100:
        raise ValueError("SAP points should be between 1 and 100.")
-    if sap_points > 91:
+    if sap_points >= 92:
        return "A"
-    elif sap_points > 80:
+    elif sap_points >= 81:
        return "B"
-    elif sap_points > 69:
+    elif sap_points >= 69:
        return "C"
-    elif sap_points > 55:
+    elif sap_points >= 55:
        return "D"
-    elif sap_points > 39:
+    elif sap_points >= 39:
        return "E"
-    elif sap_points > 21:
+    elif sap_points >= 21:
        return "F"
    else:
        return "G"
@ -108,13 +108,13 @@ def epc_to_sap_lower_bound(epc: str):
    elif epc == "B":
        return 81
    elif epc == "C":
-        return 70
+        return 69
    elif epc == "D":
-        return 56
+        return 55
    elif epc == "E":
-        return 40
+        return 39
    elif epc == "F":
-        return 22
+        return 21
    elif epc == "G":
        return 1
    else:
--- a/backend/ml_models/sap_change_model/api.py
+++ b/backend/ml_models/sap_change_model/api.py
@ -62,14 +62,14 @@ class SAPChangeModelAPI:
        logger.info("Making request to sap change api")
        url = f"{self.base_url}/sapmodel/predict"
        payload = {
-            "file_location": f"s3://retrofit-data-dev/{file_location}",
+            "file_location": file_location,
            "property_id": "",  # This should get removed
            "portfolio_id": self.portfolio_id,
            "created_at": self.timestamp
        }
        try:
-            response = requests.post(url, json=payload, headers={"Content-Type": "application/json"})
+            response = requests.post(url, json=payload, headers={"Content-Type": "application/json"}, timeout=120)
            # Check if the response status code is 2xx (success)
            response.raise_for_status()
--- a/backend/requirements/base.txt
+++ b/backend/requirements/base.txt
@ -34,4 +34,5 @@ pytz==2023.3
 mip==1.15.0
 boto3==1.28.3
 pandas==1.5.3
-pyarrow==12.0.1
+pyarrow==12.0.1
 textblob
--- a/backend/tests/test_property.py
+++ b/backend/tests/test_property.py
@ -1,15 +1,17 @@
 import pytest
 import pandas as pd
 from unittest.mock import Mock
 from epc_api.client import EpcClient
 from backend.Property import Property
-from open_uprn.OpenUprnClient import OpenUprnClient
+from etl.epc_clean.EpcClean import EpcClean
 from model_data.EpcClean import EpcClean
 # Define some test data
 mock_epc_response = {
    "rows": [
        {
            "lmk-key": 1,
            "uprn": 1,
            "number-habitable-rooms": 5,
            "property-type": "House",
            "inspection-date": "2023-06-01",
            "some-other-key": "some-value",
            "roof-description": "Roof Description",
@ -34,6 +36,10 @@ mock_epc_response = {
            "construction-age-band": "England and Wales: 1967-1975"
        },
        {
            "lmk-key": 2,
            "uprn": 2,
            "number-habitable-rooms": 5,
            "property-type": "House",
            "inspection-date": "2023-05-01",
            "some-other-key": "some-other-value",
            "roof-description": "Roof Description",
@ -63,6 +69,10 @@ mock_epc_response = {
 mock_epc_response_dupe = {
    'rows': [
        {
            "lmk-key": 1,
            "uprn": 1,
            "number-habitable-rooms": 5,
            "property-type": "House",
            'inspection-date': '2023-06-01', 'some-other-key': 'some-value', 'roof-description': 'Roof Description',
            'walls-description': 'Walls Description', 'windows-description': 'Windows Description',
            'mainheat-description': 'Main Heating Description', 'hotwater-description': 'Hot Water Description',
@ -83,6 +93,10 @@ mock_epc_response_dupe = {
            "construction-age-band": "England and Wales: 1967-1975"
        },
        {
            "lmk-key": 2,
            "uprn": 2,
            "number-habitable-rooms": 5,
            "property-type": "House",
            'inspection-date': '2023-05-01', 'some-other-key': 'some-other-value',
            'roof-description': 'Roof Description', 'walls-description': 'Walls Description',
            'windows-description': 'Windows Description', 'mainheat-description': 'Main Heating Description',
@ -104,6 +118,10 @@ mock_epc_response_dupe = {
            "construction-age-band": "England and Wales: 1967-1975"
        },
        {
            "lmk-key": 3,
            "uprn": 3,
            "number-habitable-rooms": 5,
            "property-type": "House",
            'inspection-date': '2023-06-01', 'some-other-key': 'duplicate-date',
            'roof-description': 'Roof Description',
            'walls-description': 'Walls Description', 'windows-description': 'Windows Description',
@ -130,7 +148,7 @@ mock_epc_response_dupe = {
 class TestProperty:
    @pytest.fixture(autouse=True)
-    def property_instance(self, mock_epc_client, mock_open_uprn_client, mock_cleaner):
+    def property_instance(self, mock_epc_client, mock_cleaner):
        property_instance = Property(1, "AB12CD", "Test Address", epc_client=mock_epc_client)
        return property_instance
@ -141,29 +159,18 @@ class TestProperty:
    @pytest.fixture
    def mock_epc_client(self):
-        mock_epc_client = Mock(spec=EpcClient())
+        mock_epc_client = Mock(spec=EpcClient(auth_token="mocked_auth_token"))
        mock_epc_client.domestic.search.return_value = mock_epc_response.copy()
        mock_epc_client.auth_token = "mocked_auth_token"
        return mock_epc_client
    @pytest.fixture
    def mock_epc_client_dupe_data(self):
-        mock_epc_client_dupe_data = Mock(spec=EpcClient())
+        mock_epc_client_dupe_data = Mock(spec=EpcClient(auth_token="mocked_auth_token"))
        mock_epc_client_dupe_data.domestic.search.return_value = mock_epc_response_dupe.copy()
        mock_epc_client_dupe_data.auth_token = "mocked_auth_token"
        return mock_epc_client_dupe_data
    @pytest.fixture
    def mock_open_uprn_client(self):
        mock_open_uprn_client = Mock(spec=OpenUprnClient(path=None, uprns=[12345]))
        mock_open_uprn_client.data = pd.DataFrame(
            [
                {"UPRN": 12345, "longitude": 1.2345, "latitude": 2.3456},
                {"UPRN": 12346, "longitude": 3.4567, "latitude": 4.5678}
            ]
        )
        return mock_open_uprn_client
    @pytest.fixture
    def mock_cleaner(self):
        lighting_averages = [
@ -186,9 +193,22 @@ class TestProperty:
        )
        mock_cleaner = Mock(spec=cleaner_spec)
        walls_data = {
            "original_description": "Walls Description",
            "is_cavity_wall": True,
            "is_solid_brick": False,
            "is_timber_frame": False,
            "is_system_built": False,
            "is_park_home": False,
            "is_cob": False,
            "is_sandstone_or_limestone": False,
            "is_granite_or_whinstone": False,
        }
        mock_cleaner.cleaned = {
            "roof-description": [{"original_description": "Roof Description"}],
-            "walls-description": [{"original_description": "Walls Description"}],
+            "walls-description": [walls_data],
            "windows-description": [{"original_description": "Windows Description"}],
            "mainheat-description": [{"original_description": "Main Heating Description"}],
            "hotwater-description": [{"original_description": "Hot Water Description"}],
@ -201,10 +221,10 @@ class TestProperty:
        # Should be mocked auth token
        assert inst1.epc_client.auth_token == "mocked_auth_token"
-        inst2 = Property(3, "AB12CD", "Test Address")
+        inst2 = Property(3, "AB12CD", "Test Address", epc_client=mock_epc_client)
        assert inst2.epc_client.auth_token
-        inst3 = Property(4, "AB12CD", "Test Address", data={"some": "data"})
+        inst3 = Property(4, "AB12CD", "Test Address", data={"some": "data"}, epc_client=mock_epc_client)
        assert inst3.data == {"some": "data"}
        data = inst3.search_address_epc()
@ -227,11 +247,23 @@ class TestProperty:
        # Verify that the components are set correctly
        assert property_instance.roof == {"original_description": "Roof Description"}
-        assert property_instance.walls == {"original_description": "Walls Description"}
+        assert property_instance.walls == {
            "original_description": "Walls Description",
            "is_cavity_wall": True,
            "is_solid_brick": False,
            "is_timber_frame": False,
            "is_system_built": False,
            "is_park_home": False,
            "is_cob": False,
            "is_sandstone_or_limestone": False,
            "is_granite_or_whinstone": False,
        }
        assert property_instance.windows == {"original_description": "Windows Description"}
        assert property_instance.main_heating == {"original_description": "Main Heating Description"}
        assert property_instance.hotwater == {"original_description": "Hot Water Description"}
        assert property_instance.wall_type == "cavity"
    def test_get_components_without_cleaned_data(self, property_instance, mock_cleaner):
        # Modify the mock EpcClean to not have cleaned data
        mock_cleaner.cleaned = {}
--- a/backend/tests/test_sap_model_prep.py
+++ b/backend/tests/test_sap_model_prep.py
--- a/conservation_areas/app.py
+++ b/conservation_areas/app.py
@ -1,51 +0,0 @@
 """
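 This application reads the conservation area boundary data from a static location and checks
 whether a set of open uprn coordinates falls inside a conservation area. Loading the results into
 our database for querying from other services is still to be implemented.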
 """
 import os
 from conservation_areas.ConservationAreaClient import ConservationAreaClient
 from datatypes.datatypes import OpenUprnCoordinateData
 def app():
    """
    Check a fixed sample of open uprn records against the conservation area boundaries.

    A ConservationAreaClient is built from the two boundary files addressed relative to this module and
    read() is called on it. Every sample record is then passed to is_in_conservation_area. The per-uprn
    results are only collected in a local list: nothing is returned or written anywhere yet.
    """
    module_dir = os.path.abspath(os.path.dirname(__file__))
    conservation_area_client = ConservationAreaClient(
        historic_england_path=(
            module_dir + "/model_data/local_data/Historic_Eng_Conservation_Areas/Conservation_Areas.shp"
        ),
        gov_path=module_dir + "/model_data/local_data/gov-conservation-area.geojson"
    )
    conservation_area_client.read()

    # Sample open uprn records, one row per property, values listed in `columns` order
    columns = ("UPRN", "X_COORDINATE", "Y_COORDINATE", "LATITUDE", "LONGITUDE")
    rows = (
        (6032920, 535110.0, 181819.0, 51.5191407, -0.0540506),
        (6038625, 535374.0, 182784.0, 51.5277492, -0.0498772),
        (34153991, 523238.74, 178003.02, 51.4875579, -0.226392),
        (10008299676, 533285.0, 184711.0, 51.5455629, -0.0792445),
        (10008299677, 533285.0, 184711.0, 51.5455629, -0.0792445),
        (100021039066, 535506.0, 185624.0, 51.5532385, -0.0468833),
        (100021226060, 529247.0, 187959.0, 51.5756908, -0.1362513),
        (200003489276, 533210.0, 179442.0, 51.4982309, -0.0823165),
    )
    open_uprn_data = [dict(zip(columns, row)) for row in rows]

    # We iterate through the open uprn data and check if the coordinates are in a conservation area
    result = []
    for coordinates in open_uprn_data:
        in_area = conservation_area_client.is_in_conservation_area(OpenUprnCoordinateData(**coordinates))
        result.append({"uprn": coordinates["UPRN"], "is_in_conservation_area": in_area})
    # TODO: Add a method to write to the database
--- a/data_collection/README.md
+++ b/data_collection/README.md
@ -1,5 +0,0 @@
 # Data Collection
 This service focuses specifically on collecting data from external sources that aren't easily
 accessed via an API or via downloadable data sources. For example, wage data requires a dedicated application to
 pull that data from websites, e.g. from Adzuna's API.
--- a/data_collection/adzuna.py
+++ b/data_collection/adzuna.py
@ -1,86 +0,0 @@
 import requests
 import json
 from data_collection.config import ADZUNA_API_KEY, ADZUNA_APP_ID
 import pandas as pd
 import os
 import time
 from tqdm import tqdm
 """
 Table of constituencies and their codes can be downloaded from the Office for National Statistics, found here:
 https://geoportal.statistics.gov.uk/datasets/ons::westminster-parliamentary-constituencies-december-2022-names-and-codes-in-the-united-kingdom/explore
 """
 # Load the constituency names and codes. The CSV sits in the `data` directory next to this module, so
 # the path is resolved from this file's own directory rather than repeating the package directory
 # (which produced a non-existent `data_collection/data_collection/data/...` path).
 constituencies = pd.read_csv(
    os.path.join(
        os.path.abspath(os.path.dirname(__file__)),
        "data",
        "Westminster_Parliamentary_Constituencies_(December_2022)_Names_and_Codes_in_the_United_Kingdom.csv",
    )
 )
 # Tag every row with the kind of location it describes
 constituencies["location_type"] = "constituency"
 def retry_api_call(job_title, location, max_retries=10):
    """
    Call get_adzuna_jobs, retrying when the request fails.

    :param job_title: search term passed on to get_adzuna_jobs
    :param location: location passed on to get_adzuna_jobs
    :param max_retries: maximum number of attempts before giving up
    :return: whatever get_adzuna_jobs returns, or None when every attempt failed
    """
    for attempt in range(1, max_retries + 1):
        try:
            return get_adzuna_jobs(job_title, location)
        except (requests.HTTPError, requests.ConnectionError):
            if attempt == max_retries:
                # Last attempt: nothing is retried after this, so neither announce a retry nor wait for one
                print(f"Attempt {attempt} failed.")
                break
            print(f"Attempt {attempt} failed. Retrying in 2 seconds...")
            time.sleep(2)
    print(f"Failed after {max_retries} attempts.")
    return None
 def get_adzuna_jobs(job_title, location):
    """
    Query the Adzuna job search endpoint for a job title and location.

    The request goes to the "gb" search endpoint ending in "/search/1" and asks for 25 results per page.
    An error status is raised as requests.HTTPError through raise_for_status().

    :param job_title: value sent as the "what" query parameter
    :param location: value sent as the "where" query parameter
    :return: the response body decoded from JSON
    """
    query = dict(
        app_id=ADZUNA_APP_ID,
        app_key=ADZUNA_API_KEY,
        results_per_page=25,
        what=job_title,
        where=location,
    )
    # These two keys are added separately because "content-type" is not a valid keyword name
    query["content-type"] = "application/json"
    query["distance"] = 10
    endpoint = "/".join(["https://api.adzuna.com/v1/api/jobs", "gb", "search", "1"])
    response = requests.get(endpoint, params=query)
    response.raise_for_status()
    return json.loads(response.text)
 # Search terms sent to Adzuna, one full sweep of the constituencies per term
 JOB_TITLES = [
    "insulation installer", "internal wall insulation installer", "external wall insulation installer",
    "cavity wall insulation installer", "loft insulation installer", "roof insulation installer",
    "spray foam insulation installer", "insulation technician", "insulation engineer", "iwi insulation installer",
    "iwi installer", "ewi insulation installer", "ewi installer", "cwi insulation installer", "cwi installer",
 ]
 # One entry per job advert returned, tagged with the search that found it
 results = []
 for i, job_title in enumerate(JOB_TITLES):
    print("Pulling job title %s of %s" % (str(i + 1), str(len(JOB_TITLES))))
    for _, location_config in tqdm(constituencies.iterrows(), total=constituencies.shape[0]):
        location = location_config["PCON22NM"]
        jobs = retry_api_call(job_title, location)
        time.sleep(0.5)
        # retry_api_call returns None once every attempt has failed, and a payload may lack "results";
        # skip this search instead of crashing the whole run
        if not jobs or not jobs.get("results"):
            continue
        for job in jobs["results"]:
            results.append(
                {
                    "job_title": job_title,
                    "search_location": location,
                    "search_location_code": location_config["PCON22CD"],
                    **job
                }
            )
 results_df = pd.DataFrame(results)
--- a/data_collection/config.py
+++ b/data_collection/config.py
@ -1,7 +0,0 @@
 import os
 from dotenv import load_dotenv
 load_dotenv(dotenv_path='data_collection/.env')
 ADZUNA_API_KEY = os.environ.get('ADZUNA_API_KEY')
 ADZUNA_APP_ID = os.environ.get('ADZUNA_APP_ID')
--- a/data_collection/data/.DS_Store
+++ b/data_collection/data/.DS_Store
--- a/data_collection/data/Westminster_Parliamentary_Constituencies_(December_2022)_Names_and_Codes_in_the_United_Kingdom.csv
+++ b/data_collection/data/Westminster_Parliamentary_Constituencies_(December_2022)_Names_and_Codes_in_the_United_Kingdom.csv
@ -1,651 +0,0 @@
 PCON22CD,PCON22NM,ObjectId
 E14000530,Aldershot,1
 E14000531,Aldridge-Brownhills,2
 E14000532,Altrincham and Sale West,3
 E14000533,Amber Valley,4
 E14000534,Arundel and South Downs,5
 E14000535,Ashfield,6
 E14000536,Ashford,7
 E14000537,Ashton-under-Lyne,8
 E14000538,Aylesbury,9
 E14000539,Banbury,10
 E14000540,Barking,11
 E14000541,Barnsley Central,12
 E14000542,Barnsley East,13
 E14000543,Barrow and Furness,14
 E14000544,Basildon and Billericay,15
 E14000545,Basingstoke,16
 E14000546,Bassetlaw,17
 E14000547,Bath,18
 E14000548,Batley and Spen,19
 E14000549,Battersea,20
 E14000550,Beaconsfield,21
 E14000551,Beckenham,22
 E14000552,Bedford,23
 E14000553,Bermondsey and Old Southwark,24
 E14000554,Berwick-upon-Tweed,25
 E14000555,Bethnal Green and Bow,26
 E14000556,Beverley and Holderness,27
 E14000557,Bexhill and Battle,28
 E14000558,Bexleyheath and Crayford,29
 E14000559,Birkenhead,30
 E14000560,"Birmingham, Edgbaston",31
 E14000561,"Birmingham, Erdington",32
 E14000562,"Birmingham, Hall Green",33
 E14000563,"Birmingham, Hodge Hill",34
 E14000564,"Birmingham, Ladywood",35
 E14000565,"Birmingham, Northfield",36
 E14000566,"Birmingham, Perry Barr",37
 E14000567,"Birmingham, Selly Oak",38
 E14000568,"Birmingham, Yardley",39
 E14000569,Bishop Auckland,40
 E14000570,Blackburn,41
 E14000571,Blackley and Broughton,42
 E14000572,Blackpool North and Cleveleys,43
 E14000573,Blackpool South,44
 E14000574,Blaydon,45
 E14000575,Blyth Valley,46
 E14000576,Bognor Regis and Littlehampton,47
 E14000577,Bolsover,48
 E14000578,Bolton North East,49
 E14000579,Bolton South East,50
 E14000830,Newbury,51
 E14000831,Newcastle upon Tyne Central,52
 E14000832,Newcastle upon Tyne East,53
 E14000833,Newcastle upon Tyne North,54
 E14000834,Newcastle-under-Lyme,55
 E14000835,Newton Abbot,56
 E14000836,"Normanton, Pontefract and Castleford",57
 E14000837,North Cornwall,58
 E14000838,North Devon,59
 E14000839,North Dorset,60
 E14000840,North Durham,61
 E14000841,North East Bedfordshire,62
 E14000842,North East Cambridgeshire,63
 E14000843,North East Derbyshire,64
 E14000844,North East Hampshire,65
 E14000845,North East Hertfordshire,66
 E14000846,North East Somerset,67
 E14000847,North Herefordshire,68
 E14000848,North Norfolk,69
 E14000849,North Shropshire,70
 E14000850,North Somerset,71
 E14000851,North Swindon,72
 E14000852,North Thanet,73
 E14000853,North Tyneside,74
 E14000854,North Warwickshire,75
 E14000855,North West Cambridgeshire,76
 E14000856,North West Durham,77
 E14000857,North West Hampshire,78
 E14000858,North West Leicestershire,79
 E14000859,North West Norfolk,80
 E14000860,North Wiltshire,81
 E14000861,Northampton North,82
 E14000862,Northampton South,83
 E14000863,Norwich North,84
 E14000864,Norwich South,85
 E14000865,Nottingham East,86
 E14000866,Nottingham North,87
 E14000867,Nottingham South,88
 E14000868,Nuneaton,89
 E14000869,Old Bexley and Sidcup,90
 E14000870,Oldham East and Saddleworth,91
 E14000871,Oldham West and Royton,92
 E14000872,Orpington,93
 E14000873,Oxford East,94
 E14000874,Oxford West and Abingdon,95
 E14000875,Pendle,96
 E14000876,Penistone and Stocksbridge,97
 E14000877,Penrith and The Border,98
 E14000878,Peterborough,99
 E14000879,"Plymouth, Moor View",100
 E14000580,Bolton West,101
 E14000581,Bootle,102
 E14000582,Boston and Skegness,103
 E14000583,Bosworth,104
 E14000584,Bournemouth East,105
 E14000585,Bournemouth West,106
 E14000586,Bracknell,107
 E14000587,Bradford East,108
 E14000588,Bradford South,109
 E14000589,Bradford West,110
 E14000590,Braintree,111
 E14000591,Brent Central,112
 E14000592,Brent North,113
 E14000593,Brentford and Isleworth,114
 E14000594,Brentwood and Ongar,115
 E14000595,Bridgwater and West Somerset,116
 E14000596,Brigg and Goole,117
 E14000597,"Brighton, Kemptown",118
 E14000598,"Brighton, Pavilion",119
 E14000599,Bristol East,120
 E14000600,Bristol North West,121
 E14000601,Bristol South,122
 E14000602,Bristol West,123
 E14000603,Broadland,124
 E14000604,Bromley and Chislehurst,125
 E14000605,Bromsgrove,126
 E14000606,Broxbourne,127
 E14000607,Broxtowe,128
 E14000608,Buckingham,129
 E14000609,Burnley,130
 E14000610,Burton,131
 E14000611,Bury North,132
 E14000612,Bury South,133
 E14000613,Bury St Edmunds,134
 E14000614,Calder Valley,135
 E14000615,Camberwell and Peckham,136
 E14000616,Camborne and Redruth,137
 E14000617,Cambridge,138
 E14000618,Cannock Chase,139
 E14000619,Canterbury,140
 E14000620,Carlisle,141
 E14000621,Carshalton and Wallington,142
 E14000622,Castle Point,143
 E14000623,Central Devon,144
 E14000624,Central Suffolk and North Ipswich,145
 E14000625,Charnwood,146
 E14000626,Chatham and Aylesford,147
 E14000627,Cheadle,148
 E14000628,Chelmsford,149
 E14000629,Chelsea and Fulham,150
 E14000630,Cheltenham,151
 E14000631,Chesham and Amersham,152
 E14000632,Chesterfield,153
 E14000633,Chichester,154
 E14000634,Chingford and Woodford Green,155
 E14000635,Chippenham,156
 E14000636,Chipping Barnet,157
 E14000637,Chorley,158
 E14000638,Christchurch,159
 E14000639,Cities of London and Westminster,160
 E14000640,City of Chester,161
 E14000641,City of Durham,162
 E14000642,Clacton,163
 E14000643,Cleethorpes,164
 E14000644,Colchester,165
 E14000645,Colne Valley,166
 E14000646,Congleton,167
 E14000647,Copeland,168
 E14000648,Corby,169
 E14000649,Coventry North East,170
 E14000650,Coventry North West,171
 E14000651,Coventry South,172
 E14000652,Crawley,173
 E14000653,Crewe and Nantwich,174
 E14000654,Croydon Central,175
 E14000655,Croydon North,176
 E14000656,Croydon South,177
 E14000657,Dagenham and Rainham,178
 E14000658,Darlington,179
 E14000659,Dartford,180
 E14000660,Daventry,181
 E14000661,Denton and Reddish,182
 E14000662,Derby North,183
 E14000663,Derby South,184
 E14000664,Derbyshire Dales,185
 E14000665,Devizes,186
 E14000666,Dewsbury,187
 E14000667,Don Valley,188
 E14000668,Doncaster Central,189
 E14000669,Doncaster North,190
 E14000670,Dover,191
 E14000671,Dudley North,192
 E14000672,Dudley South,193
 E14000673,Dulwich and West Norwood,194
 E14000674,Ealing Central and Acton,195
 E14000675,Ealing North,196
 E14000676,"Ealing, Southall",197
 E14000677,Easington,198
 E14000678,East Devon,199
 E14000679,East Ham,200
 E14000780,Leeds North West,201
 E14000781,Leeds West,202
 E14000782,Leicester East,203
 E14000783,Leicester South,204
 E14000784,Leicester West,205
 E14000785,Leigh,206
 E14000786,Lewes,207
 E14000787,Lewisham East,208
 E14000788,Lewisham West and Penge,209
 E14000789,"Lewisham, Deptford",210
 E14000790,Leyton and Wanstead,211
 E14000791,Lichfield,212
 E14000792,Lincoln,213
 E14000793,"Liverpool, Riverside",214
 E14000794,"Liverpool, Walton",215
 E14000795,"Liverpool, Wavertree",216
 E14000796,"Liverpool, West Derby",217
 E14000797,Loughborough,218
 E14000798,Louth and Horncastle,219
 E14000799,Ludlow,220
 E14000800,Luton North,221
 E14000801,Luton South,222
 E14000802,Macclesfield,223
 E14000803,Maidenhead,224
 E14000804,Maidstone and The Weald,225
 E14000805,Makerfield,226
 E14000806,Maldon,227
 E14000807,Manchester Central,228
 E14000808,"Manchester, Gorton",229
 E14000809,"Manchester, Withington",230
 E14000810,Mansfield,231
 E14000811,Meon Valley,232
 E14000812,Meriden,233
 E14000813,Mid Bedfordshire,234
 E14000814,Mid Derbyshire,235
 E14000815,Mid Dorset and North Poole,236
 E14000816,Mid Norfolk,237
 E14000817,Mid Sussex,238
 E14000818,Mid Worcestershire,239
 E14000819,Middlesbrough,240
 E14000820,Middlesbrough South and East Cleveland,241
 E14000821,Milton Keynes North,242
 E14000822,Milton Keynes South,243
 E14000823,Mitcham and Morden,244
 E14000824,Mole Valley,245
 E14000825,Morecambe and Lunesdale,246
 E14000826,Morley and Outwood,247
 E14000827,New Forest East,248
 E14000828,New Forest West,249
 E14000829,Newark,250
 E14000680,East Hampshire,251
 E14000681,East Surrey,252
 E14000682,East Worthing and Shoreham,253
 E14000683,East Yorkshire,254
 E14000880,"Plymouth, Sutton and Devonport",255
 E14000684,Eastbourne,256
 E14000685,Eastleigh,257
 E14000881,Poole,258
 E14000686,Eddisbury,259
 E14000882,Poplar and Limehouse,260
 E14000687,Edmonton,261
 E14000883,Portsmouth North,262
 E14000688,Ellesmere Port and Neston,263
 E14000884,Portsmouth South,264
 E14000689,Elmet and Rothwell,265
 E14000885,Preston,266
 E14000690,Eltham,267
 E14000886,Pudsey,268
 E14000691,Enfield North,269
 E14000887,Putney,270
 E14000692,"Enfield, Southgate",271
 E14000888,Rayleigh and Wickford,272
 E14000693,Epping Forest,273
 E14000889,Reading East,274
 E14000694,Epsom and Ewell,275
 E14000890,Reading West,276
 E14000695,Erewash,277
 E14000891,Redcar,278
 E14000696,Erith and Thamesmead,279
 E14000892,Redditch,280
 E14000697,Esher and Walton,281
 E14000893,Reigate,282
 E14000698,Exeter,283
 E14000894,Ribble Valley,284
 E14000699,Fareham,285
 E14000895,Richmond (Yorks),286
 E14000700,Faversham and Mid Kent,287
 E14000896,Richmond Park,288
 E14000701,Feltham and Heston,289
 E14000897,Rochdale,290
 E14000702,Filton and Bradley Stoke,291
 E14000898,Rochester and Strood,292
 E14000703,Finchley and Golders Green,293
 E14000899,Rochford and Southend East,294
 E14000704,Folkestone and Hythe,295
 E14000900,Romford,296
 E14000705,Forest of Dean,297
 E14000901,Romsey and Southampton North,298
 E14000706,Fylde,299
 E14000902,Rossendale and Darwen,300
 E14000707,Gainsborough,301
 E14000903,Rother Valley,302
 E14000904,Rotherham,303
 E14000905,Rugby,304
 E14000906,"Ruislip, Northwood and Pinner",305
 E14000907,Runnymede and Weybridge,306
 E14000908,Rushcliffe,307
 E14000909,Rutland and Melton,308
 E14000910,Saffron Walden,309
 E14000911,Salford and Eccles,310
 E14000912,Salisbury,311
 E14000913,Scarborough and Whitby,312
 E14000914,Scunthorpe,313
 E14000915,Sedgefield,314
 E14000916,Sefton Central,315
 E14000917,Selby and Ainsty,316
 E14000918,Sevenoaks,317
 E14000919,Sheffield Central,318
 E14000920,Sheffield South East,319
 E14000921,"Sheffield, Brightside and Hillsborough",320
 E14000922,"Sheffield, Hallam",321
 E14000923,"Sheffield, Heeley",322
 E14000924,Sherwood,323
 E14000925,Shipley,324
 E14000926,Shrewsbury and Atcham,325
 E14000927,Sittingbourne and Sheppey,326
 E14000928,Skipton and Ripon,327
 E14000929,Sleaford and North Hykeham,328
 E14000730,Harrogate and Knaresborough,329
 E14000731,Harrow East,330
 E14000732,Harrow West,331
 E14000733,Hartlepool,332
 E14000734,Harwich and North Essex,333
 E14000735,Hastings and Rye,334
 E14000736,Havant,335
 E14000737,Hayes and Harlington,336
 E14000738,Hazel Grove,337
 E14000739,Hemel Hempstead,338
 E14000740,Hemsworth,339
 E14000741,Hendon,340
 E14000742,Henley,341
 E14000743,Hereford and South Herefordshire,342
 E14000744,Hertford and Stortford,343
 E14000745,Hertsmere,344
 E14000746,Hexham,345
 E14000747,Heywood and Middleton,346
 E14000748,High Peak,347
 E14000749,Hitchin and Harpenden,348
 E14000750,Holborn and St Pancras,349
 E14000751,Hornchurch and Upminster,350
 E14000752,Hornsey and Wood Green,351
 E14000753,Horsham,352
 E14000754,Houghton and Sunderland South,353
 E14000755,Hove,354
 E14000756,Huddersfield,355
 E14000757,Huntingdon,356
 E14000758,Hyndburn,357
 E14000759,Ilford North,358
 E14000760,Ilford South,359
 E14000761,Ipswich,360
 E14000762,Isle of Wight,361
 E14000763,Islington North,362
 E14000764,Islington South and Finsbury,363
 E14000765,Jarrow,364
 E14000766,Keighley,365
 E14000767,Kenilworth and Southam,366
 E14000768,Kensington,367
 E14000769,Kettering,368
 E14000770,Kingston and Surbiton,369
 E14000771,Kingston upon Hull East,370
 E14000772,Kingston upon Hull North,371
 E14000773,Kingston upon Hull West and Hessle,372
 E14000774,Kingswood,373
 E14000775,Knowsley,374
 E14000776,Lancaster and Fleetwood,375
 E14000777,Leeds Central,376
 E14000778,Leeds East,377
 E14000779,Leeds North East,378
 E14000708,Garston and Halewood,379
 E14000709,Gateshead,380
 E14000710,Gedling,381
 E14000711,Gillingham and Rainham,382
 E14000712,Gloucester,383
 E14000713,Gosport,384
 E14000714,Grantham and Stamford,385
 E14000715,Gravesham,386
 E14000716,Great Grimsby,387
 E14000717,Great Yarmouth,388
 E14000718,Greenwich and Woolwich,389
 E14000719,Guildford,390
 E14000720,Hackney North and Stoke Newington,391
 E14000721,Hackney South and Shoreditch,392
 E14000722,Halesowen and Rowley Regis,393
 E14000723,Halifax,394
 E14000724,Haltemprice and Howden,395
 E14000725,Halton,396
 E14000726,Hammersmith,397
 E14000727,Hampstead and Kilburn,398
 E14000728,Harborough,399
 E14000729,Harlow,400
 E14000930,Slough,401
 E14000931,Solihull,402
 E14000932,Somerton and Frome,403
 E14000933,South Basildon and East Thurrock,404
 E14000934,South Cambridgeshire,405
 E14000935,South Derbyshire,406
 E14000936,South Dorset,407
 E14000937,South East Cambridgeshire,408
 E14000938,South East Cornwall,409
 E14000939,South Holland and The Deepings,410
 E14000940,South Leicestershire,411
 E14000941,South Norfolk,412
 E14000942,South Northamptonshire,413
 E14000943,South Ribble,414
 E14000944,South Shields,415
 E14000945,South Staffordshire,416
 E14000946,South Suffolk,417
 E14000947,South Swindon,418
 E14000948,South Thanet,419
 E14000949,South West Bedfordshire,420
 E14000950,South West Devon,421
 E14000951,South West Hertfordshire,422
 E14000952,South West Norfolk,423
 E14000953,South West Surrey,424
 E14000954,South West Wiltshire,425
 E14000955,"Southampton, Itchen",426
 E14000956,"Southampton, Test",427
 E14000957,Southend West,428
 E14000958,Southport,429
 E14000959,Spelthorne,430
 E14000960,St Albans,431
 E14000961,St Austell and Newquay,432
 E14000962,St Helens North,433
 E14000963,St Helens South and Whiston,434
 E14000964,St Ives,435
 E14000965,Stafford,436
 E14000966,Staffordshire Moorlands,437
 E14000967,Stalybridge and Hyde,438
 E14000968,Stevenage,439
 E14000969,Stockport,440
 E14000970,Stockton North,441
 E14000971,Stockton South,442
 E14000972,Stoke-on-Trent Central,443
 E14000973,Stoke-on-Trent North,444
 E14000974,Stoke-on-Trent South,445
 E14000975,Stone,446
 E14000976,Stourbridge,447
 E14000977,Stratford-on-Avon,448
 E14000978,Streatham,449
 E14000979,Stretford and Urmston,450
 E14000980,Stroud,451
 E14000981,Suffolk Coastal,452
 E14000982,Sunderland Central,453
 E14000983,Surrey Heath,454
 E14000984,Sutton and Cheam,455
 E14000985,Sutton Coldfield,456
 E14000986,Tamworth,457
 E14000987,Tatton,458
 E14000988,Taunton Deane,459
 E14000989,Telford,460
 E14000990,Tewkesbury,461
 E14000991,The Cotswolds,462
 E14000992,The Wrekin,463
 E14000993,Thirsk and Malton,464
 E14000994,Thornbury and Yate,465
 E14000995,Thurrock,466
 E14000996,Tiverton and Honiton,467
 E14000997,Tonbridge and Malling,468
 E14000998,Tooting,469
 E14000999,Torbay,470
 E14001000,Torridge and West Devon,471
 E14001001,Totnes,472
 E14001002,Tottenham,473
 E14001003,Truro and Falmouth,474
 E14001004,Tunbridge Wells,475
 E14001005,Twickenham,476
 E14001006,Tynemouth,477
 E14001007,Uxbridge and South Ruislip,478
 E14001008,Vauxhall,479
 E14001009,Wakefield,480
 E14001010,Wallasey,481
 E14001011,Walsall North,482
 E14001012,Walsall South,483
 E14001013,Walthamstow,484
 E14001014,Wansbeck,485
 E14001015,Wantage,486
 E14001016,Warley,487
 E14001017,Warrington North,488
 E14001018,Warrington South,489
 E14001019,Warwick and Leamington,490
 E14001020,Washington and Sunderland West,491
 E14001021,Watford,492
 E14001022,Waveney,493
 E14001023,Wealden,494
 E14001024,Weaver Vale,495
 E14001025,Wellingborough,496
 E14001026,Wells,497
 E14001027,Welwyn Hatfield,498
 E14001028,Wentworth and Dearne,499
 E14001029,West Bromwich East,500
 E14001030,West Bromwich West,501
 E14001031,West Dorset,502
 E14001032,West Ham,503
 E14001033,West Lancashire,504
 E14001034,West Suffolk,505
 E14001035,West Worcestershire,506
 E14001036,Westminster North,507
 E14001037,Westmorland and Lonsdale,508
 E14001038,Weston-Super-Mare,509
 E14001039,Wigan,510
 E14001040,Wimbledon,511
 E14001041,Winchester,512
 E14001042,Windsor,513
 E14001043,Wirral South,514
 E14001044,Wirral West,515
 E14001045,Witham,516
 E14001046,Witney,517
 E14001047,Woking,518
 E14001048,Wokingham,519
 E14001049,Wolverhampton North East,520
 E14001050,Wolverhampton South East,521
 E14001051,Wolverhampton South West,522
 E14001052,Worcester,523
 E14001053,Workington,524
 E14001054,Worsley and Eccles South,525
 E14001055,Worthing West,526
 E14001056,Wycombe,527
 E14001057,Wyre and Preston North,528
 E14001058,Wyre Forest,529
 E14001059,Wythenshawe and Sale East,530
 E14001060,Yeovil,531
 E14001061,York Central,532
 E14001062,York Outer,533
 N06000001,Belfast East,534
 N06000002,Belfast North,535
 N06000003,Belfast South,536
 N06000004,Belfast West,537
 N06000005,East Antrim,538
 N06000006,East Londonderry,539
 N06000007,Fermanagh and South Tyrone,540
 N06000008,Foyle,541
 N06000009,Lagan Valley,542
 N06000010,Mid Ulster,543
 N06000011,Newry and Armagh,544
 N06000012,North Antrim,545
 N06000013,North Down,546
 N06000014,South Antrim,547
 N06000015,South Down,548
 N06000016,Strangford,549
 N06000017,Upper Bann,550
 S14000050,Ochil and South Perthshire,551
 S14000051,Orkney and Shetland,552
 S14000052,Paisley and Renfrewshire North,553
 S14000053,Paisley and Renfrewshire South,554
 S14000054,Perth and North Perthshire,555
 S14000055,"Ross, Skye and Lochaber",556
 S14000056,Rutherglen and Hamilton West,557
 S14000057,Stirling,558
 S14000058,West Aberdeenshire and Kincardine,559
 S14000059,West Dunbartonshire,560
 W07000041,Ynys Môn,561
 W07000042,Delyn,562
 W07000043,Alyn and Deeside,563
 W07000044,Wrexham,564
 W07000045,Llanelli,565
 W07000046,Gower,566
 W07000047,Swansea West,567
 W07000048,Swansea East,568
 W07000049,Aberavon,569
 W07000050,Cardiff Central,570
 W07000051,Cardiff North,571
 W07000052,Rhondda,572
 W07000053,Torfaen,573
 W07000054,Monmouth,574
 W07000055,Newport East,575
 W07000056,Newport West,576
 W07000057,Arfon,577
 W07000058,Aberconwy,578
 W07000059,Clwyd West,579
 W07000060,Vale of Clwyd,580
 W07000061,Dwyfor Meirionnydd,581
 W07000062,Clwyd South,582
 W07000063,Montgomeryshire,583
 W07000064,Ceredigion,584
 W07000065,Preseli Pembrokeshire,585
 W07000066,Carmarthen West and South Pembrokeshire,586
 W07000067,Carmarthen East and Dinefwr,587
 W07000068,Brecon and Radnorshire,588
 W07000069,Neath,589
 W07000070,Cynon Valley,590
 W07000071,Merthyr Tydfil and Rhymney,591
 W07000072,Blaenau Gwent,592
 W07000073,Bridgend,593
 W07000074,Ogmore,594
 W07000075,Pontypridd,595
 W07000076,Caerphilly,596
 W07000077,Islwyn,597
 W07000078,Vale of Glamorgan,598
 W07000079,Cardiff West,599
 W07000080,Cardiff South and Penarth,600
 N06000018,West Tyrone,601
 S14000001,Aberdeen North,602
 S14000002,Aberdeen South,603
 S14000003,Airdrie and Shotts,604
 S14000004,Angus,605
 S14000005,Argyll and Bute,606
 S14000006,"Ayr, Carrick and Cumnock",607
 S14000007,Banff and Buchan,608
 S14000008,"Berwickshire, Roxburgh and Selkirk",609
 S14000009,"Caithness, Sutherland and Easter Ross",610
 S14000010,Central Ayrshire,611
 S14000011,"Coatbridge, Chryston and Bellshill",612
 S14000012,"Cumbernauld, Kilsyth and Kirkintilloch East",613
 S14000013,Dumfries and Galloway,614
 S14000014,"Dumfriesshire, Clydesdale and Tweeddale",615
 S14000015,Dundee East,616
 S14000016,Dundee West,617
 S14000017,Dunfermline and West Fife,618
 S14000018,East Dunbartonshire,619
 S14000019,"East Kilbride, Strathaven and Lesmahagow",620
 S14000020,East Lothian,621
 S14000021,East Renfrewshire,622
 S14000022,Edinburgh East,623
 S14000023,Edinburgh North and Leith,624
 S14000024,Edinburgh South,625
 S14000025,Edinburgh South West,626
 S14000026,Edinburgh West,627
 S14000027,Na h-Eileanan an Iar,628
 S14000028,Falkirk,629
 S14000029,Glasgow Central,630
 S14000030,Glasgow East,631
 S14000031,Glasgow North,632
 S14000032,Glasgow North East,633
 S14000033,Glasgow North West,634
 S14000034,Glasgow South,635
 S14000035,Glasgow South West,636
 S14000036,Glenrothes,637
 S14000037,Gordon,638
 S14000038,Inverclyde,639
 S14000039,"Inverness, Nairn, Badenoch and Strathspey",640
 S14000040,Kilmarnock and Loudoun,641
 S14000041,Kirkcaldy and Cowdenbeath,642
 S14000042,Lanark and Hamilton East,643
 S14000043,Linlithgow and East Falkirk,644
 S14000044,Livingston,645
 S14000045,Midlothian,646
 S14000046,Moray,647
 S14000047,Motherwell and Wishaw,648
 S14000048,North Ayrshire and Arran,649
 S14000049,North East Fife,650
--- a/data_collection/local_authority.py
+++ b/data_collection/local_authority.py
@ -1 +0,0 @@
--- a/data_collection/requirements.txt
+++ b/data_collection/requirements.txt
@ -1,4 +0,0 @@
 requests
 python-dotenv
 pandas
 tqdm
--- a/model_data/init.py
+++ b/model_data/init.py
--- a/model_data/simulation_system/core/DataProcessor.py
+++ b/model_data/simulation_system/core/DataProcessor.py
@ -1,26 +1,61 @@
 from pathlib import Path
 import numpy as np
 import pandas as pd
-from model_data.BaseUtility import Definitions
+from BaseUtility import Definitions
-from model_data.simulation_system.core.Settings import (
+from etl.epc.settings import (
    DATA_PROCESSOR_SETTINGS,
    EARLIEST_EPC_DATE,
    FULLY_GLAZED_DESCRIPTIONS,
    AVERAGE_FIXED_FEATURES,
    FLOOR_LEVEL_MAP,
    BUILT_FORM_REMAP,
    COLUMNS_TO_MERGE_ON,
    COMPONENT_FEATURES,
    FIXED_FEATURES,
    COLUMNTYPES,
    RDSAP_RESPONSE,
    MAX_SAP_SCORE,
    fill_na_map,
-    FIXED_DESCRIPTON_MAPPED_FEATURES
+    STARTING_SUFFIX_COMPONENT_COLS,
    NO_SUFFIX_COMPONENT_COLS,
    ENDING_SUFFIX_COMPONENT_COLS
 )
 from recommendations.rdsap_tables import FLOOR_LEVEL_MAP
 from typing import List
 # Lookups used to clean the construction age band.
 # bounds_map: standard band label -> inclusive lower ("l") and upper ("u") year of that band
 bounds_map = {
    band: {"l": lower, "u": upper}
    for band, lower, upper in (
        ("England and Wales: before 1900", 0, 1899),
        ("England and Wales: 1930-1949", 1930, 1949),
        ("England and Wales: 1900-1929", 1900, 1929),
        ("England and Wales: 1950-1966", 1950, 1966),
        ("England and Wales: 1967-1975", 1967, 1975),
        ("England and Wales: 1976-1982", 1976, 1982),
        ("England and Wales: 1983-1990", 1983, 1990),
        ("England and Wales: 1991-1995", 1991, 1995),
        ("England and Wales: 1996-2002", 1996, 2002),
        ("England and Wales: 2003-2006", 2003, 2006),
        ("England and Wales: 2007-2011", 2007, 2011),
        ("England and Wales: 2012 onwards", 2012, 3000),
    )
 }
 # remap: non-standard band label -> the standard label it is replaced with
 remap = {"England and Wales: 2007 onwards": "England and Wales: 2007-2011"}
 # expanded_map: every year from 0 to 3000 -> the first band label whose bounds contain that year
 expanded_map = {
    year: next(
        band for band, limits in bounds_map.items() if limits["l"] <= year <= limits["u"]
    )
    for year in range(3001)
 }
 def is_int(x):
    """
    Report whether int(x) succeeds.

    :param x: any value
    :return: True when int(x) can be evaluated, False when it raises a conversion error
    """
    try:
        int(x)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures mean "not an int": unsupported type, unparsable text or NaN,
        # and infinity. Anything else (e.g. KeyboardInterrupt) must propagate.
        return False
    return True
 class DataProcessor:
    """
@ -46,66 +81,36 @@ class DataProcessor:
    def insert_data(self, data: pd.DataFrame) -> None:
        """
        Store the supplied dataframe on the instance as self.data

        :param data: dataframe to attach to this processor
        """
        self.data = data
    @staticmethod
    def clean_construction_age_band(x):
        """
        Map a single construction age band value onto one of the standard band labels

        Values are handled in this order: anomaly/missing values are returned unchanged, standard
        labels (keys of bounds_map) are returned unchanged, known non-standard labels are swapped
        using remap, and values that int() accepts are looked up in expanded_map

        :param x: raw construction age band value
        :return: the cleaned value
        :raises NotImplementedError: if the value matches none of the handled cases
        :raises KeyError: if the value converts to an integer that is not a key of expanded_map
        """
        # Firstly, we check if it's an error value. NaN is detected by self-inequality because
        # membership in [None, np.nan] only matches NaN by object identity, so a NaN that is not
        # the np.nan singleton would otherwise fall through to the NotImplementedError below
        is_missing = x is None or (isinstance(x, (float, np.floating)) and x != x)
        if x in Definitions.DATA_ANOMALY_MATCHES or is_missing:
            return x

        # Next, we check if it's a value in our map
        if bounds_map.get(x):
            return x

        # We check if it's a standard remap value
        remap_value = remap.get(x, None)
        if remap_value:
            return remap_value

        # We check if it's a number
        if is_int(x):
            x_int = int(x)
            return expanded_map[x_int]

        raise NotImplementedError("Not handled the case for value %s" % x)
    def standardise_construction_age_band(self):
        """
        This function will tidy up some of the non-standard values that are populated in the construction age
        band, which is useful for cleaning
        """
        bounds_map = {
            "England and Wales: before 1900": {"l": 0, "u": 1899},
            "England and Wales: 1930-1949": {"l": 1930, "u": 1949},
            "England and Wales: 1900-1929": {"l": 1900, "u": 1929},
            "England and Wales: 1950-1966": {"l": 1950, "u": 1966},
            "England and Wales: 1967-1975": {"l": 1967, "u": 1975},
            "England and Wales: 1976-1982": {"l": 1976, "u": 1982},
            "England and Wales: 1983-1990": {"l": 1983, "u": 1990},
            "England and Wales: 1991-1995": {"l": 1991, "u": 1995},
            "England and Wales: 1996-2002": {"l": 1996, "u": 2002},
            "England and Wales: 2003-2006": {"l": 2003, "u": 2006},
            "England and Wales: 2007-2011": {"l": 2007, "u": 2011},
            "England and Wales: 2012 onwards": {"l": 2012, "u": 3000},
        }
        remap = {
            "England and Wales: 2007 onwards": "England and Wales: 2007-2011"
        }
        expanded_map = {
            i: [
                label for label, bounds in bounds_map.items() if (i <= bounds["u"]) and (i >= bounds['l'])
            ][0] for i in range(0, 3001)
        }
        def is_int(x):
            try:
                int(x)
                return True
            except:
                return False
        def clean_construction_age_band(x):
            # Firstly, we check if it's an error value
            if x in Definitions.DATA_ANOMALY_MATCHES or x in [None, np.nan]:
                return x
            # Next, we check if it's a value in our map
            if bounds_map.get(x):
                return x
            # We check if it's a standard remap value
            remap_value = remap.get(x, None)
            if remap_value:
                return remap_value
            # We check if it's a number
            if is_int(x):
                x_int = int(x)
                return expanded_map[x_int]
            raise NotImplementedError("Not handled the case for value %s" % x)
        self.data["CONSTRUCTION_AGE_BAND"] = self.data["CONSTRUCTION_AGE_BAND"].apply(
-            lambda x: clean_construction_age_band(x)
+            lambda x: self.clean_construction_age_band(x)
        )
        self.data = self.data[
@ -157,18 +162,6 @@ class DataProcessor:
                    break
                to_index -= 1
    def reformat_columns(self):
        """
        Rename the columns of self.data into the bulk download naming format

        When requesting the epc data from the api, the columns are lower case and separated
        by a hyphen, whereas in the bulk download the columns are capitalised and separated
        by underscores. Every column name is upper-cased and its hyphens are replaced with
        underscores
        :return:
        """
        renamed = []
        for name in self.data.columns:
            renamed.append(name.upper().replace("-", "_"))
        self.data.columns = renamed
    def pre_process(self) -> pd.DataFrame:
        """
        Load data and begin initial cleaning
@ -176,22 +169,24 @@ class DataProcessor:
        if self.data is None:
            self.load_data(low_memory=DATA_PROCESSOR_SETTINGS["low_memory"])
        if self.newdata:
            self.reformat_columns()
        if not self.newdata:
            self.confine_data()
        self.remap_columns()
        # We have some non-standard construction age bands which we'll clean for matching
-        self.standardise_construction_age_band()
+        if not self.newdata:
-        self.clean_missing_rooms()
+            self.standardise_construction_age_band()
            self.clean_missing_rooms()
        self.recast_df_columns(
            column_mappings=DATA_PROCESSOR_SETTINGS["column_mappings"]
        )
-        self.clean_multi_glaze_proportion()
+
        if not self.newdata:
            self.clean_multi_glaze_proportion()
        self.clean_photo_supply()
        if not self.newdata:
@ -203,16 +198,24 @@ class DataProcessor:
            # If we have multiple EPC records, we can try and do filling
            self.fill_na_fields()
-        self.data = self.data.sort_values(["UPRN", "LODGEMENT_DATE"], ascending=True)
+        if not self.newdata:
            self.data = self.data.sort_values(["UPRN", "LODGEMENT_DATE"], ascending=True)
        # Final re-casting after data transformed and prepared
-        self.data = self.data.astype(COLUMNTYPES)
+        coltypes = {k: v for k, v in COLUMNTYPES.items() if k in self.data.columns} if self.newdata else COLUMNTYPES
        self.data = self.data.astype(coltypes)
        self.na_remapping()
        return self.data
    def na_remapping(self):
-        for column, fill_value in fill_na_map.items():
+
        fill_na_map_apply = {
            k: v for k, v in fill_na_map.items() if k in self.data.columns
        } if self.newdata else fill_na_map
        for column, fill_value in fill_na_map_apply.items():
            self.data[column] = self.data[column].fillna(fill_value)
    def fill_na_fields(self, columns_to_fill: List = COLUMNS_TO_MERGE_ON):
@ -255,7 +258,8 @@ class DataProcessor:
        data = data.replace(np.NAN, None)
        # Remap certain columns
-        data["FLOOR_LEVEL"] = data["FLOOR_LEVEL"].replace(FLOOR_LEVEL_MAP)
+        if not self.newdata:
            data["FLOOR_LEVEL"] = data["FLOOR_LEVEL"].replace(FLOOR_LEVEL_MAP)
        data["BUILT_FORM"] = data["BUILT_FORM"].replace(BUILT_FORM_REMAP)
        convert_to_lower = ["TRANSACTION_TYPE"]
@ -348,7 +352,7 @@ class DataProcessor:
            cleaning_averages_filled = cleaning_averages_filled.drop(columns=f"{variable}_BUILT_FORM_AVERAGE")
-            # If there still is na values, use average across all properties in consituecy
+            # If there are still NA values, use the average across all EPCs in the constituency
            cleaning_averages_filled[variable] = cleaning_averages_filled[
                variable
            ].fillna(cleaning_averages_filled[variable].mean())
@ -497,9 +501,15 @@ class DataProcessor:
        """
        if suffix not in ["_STARTING", "_ENDING"]:
-            raise Exception("Suffix should be one of _STARTING or _ENFING")
+            raise Exception("Suffix should be one of _STARTING or _ENDING")
-        return self.data[COMPONENT_FEATURES + ["LODGEMENT_DATE"]].copy().add_suffix(suffix)
+        if suffix == "_STARTING":
            starting_cols = self.data[STARTING_SUFFIX_COMPONENT_COLS].copy().add_suffix(suffix)
            fixed_cols = self.data[NO_SUFFIX_COMPONENT_COLS].copy()
            return pd.concat([starting_cols, fixed_cols], axis=1)
        return self.data[ENDING_SUFFIX_COMPONENT_COLS].copy().add_suffix(suffix)
    def get_fixed_features(self) -> pd.DataFrame:
        """
@ -529,125 +539,33 @@ class DataProcessor:
        return df
-    @classmethod
+    @staticmethod
-    def difference_data(cls, df: pd.DataFrame):
+    def calculate_days_to(lodgement_date):
-        """
+        if isinstance(lodgement_date, str):
-        Given a dataframe and starting and ending columns, this function will convert the features to
+            return (
-        differenced the ending subtract the starting value, which is useful for modelling the difference responces
+                pd.to_datetime(lodgement_date) - pd.to_datetime(EARLIEST_EPC_DATE)
-        """
+            ).days
-        # We ensure that the u value columns are co-erced to a numerical format
+        return (
-        uvalue_columns = [col for col in df.columns if "thermal_transmittance" in col]
+            pd.to_datetime(lodgement_date) - pd.to_datetime(EARLIEST_EPC_DATE)
-        for uvalue_col in uvalue_columns:
+        ).dt.days
            df[uvalue_col] = pd.to_numeric(df[uvalue_col])
-        key_columns = [
+    @staticmethod
-            "RDSAP_CHANGE", "HEAT_DEMAND_CHANGE", "CARBON_CHANGE",
+    def clean_missings_after_description_process(df, ignore_cols=None):
-            "SAP_STARTING", "HEAT_DEMAND_STARTING",
+        missings = pd.isnull(df).sum()
-            "CARBON_STARTING", "UPRN", "CONSTITUENCY",
+        missings = missings[missings > 0]
            "SAP_ENDING", "CARBON_ENDING", "HEAT_DEMAND_ENDING",
            "DAYS_TO_STARTING", "DAYS_TO_ENDING"
        ]
-        ignore_cols = FIXED_FEATURES + FIXED_DESCRIPTON_MAPPED_FEATURES + key_columns
+        if ignore_cols:
            missings = missings[~missings.index.isin(ignore_cols)]
-        columns = {x for x in df.columns if x not in ignore_cols}
+        for col in missings.index:
-
+            unique_values = df[col].unique()
-        non_numerical_columns = df.select_dtypes(exclude=['number']).columns.tolist()
+            if True in unique_values or False in unique_values:
-        non_numerical_columns = [col for col in non_numerical_columns if col in columns]
+                df[col] = df[col].fillna(False)
-        levels = {col: df[col].unique().tolist() for col in non_numerical_columns}
+            if "none" in unique_values:
-
+                df[col] = df[col].fillna("none")
        df = pd.get_dummies(df, columns=non_numerical_columns)
        # We make sure there is a starting and ending version of the column
        diff_columns = []
        no_diff_columns = []  # Store for debugging
        for col in columns:
            if "_ENDING" in col:
                # Don't keep the endings
                continue
            else:
-                # We have a starting column so check if we have an ending
+                df[col] = df[col].fillna("Unknown")
                if col.replace("_STARTING", "") + "_ENDING" in columns:
                    diff_columns.append(col)
                else:
                    no_diff_columns.append(col)
        if any(c not in FIXED_DESCRIPTON_MAPPED_FEATURES for c in no_diff_columns):
            raise Exception("Something went wrong, potentially missed a differencing column")
        datatypes = df.dtypes
        # Note: We also difference columns like floor area and floor height. We should experiement with this.
        # Starting floor area will heavily impact the starting sap value so that feature may be encapsulated by
        # the starting value, therefore to explain any differences in the new floor area, it may be enough to
        # just consider the difference however we can play around with this.
        # Do the differencing
        cols_to_append = {}
        for starting_col in diff_columns:
            base_col = starting_col.replace("_STARTING", "")
            if "_STARTING" in starting_col:
                ending_col = starting_col.replace("_STARTING", "_ENDING")
            else:
                ending_col = starting_col + "_ENDING"
            if starting_col not in non_numerical_columns:
                cols_to_append[f"{base_col}_DIFF"] = df[ending_col] - df[starting_col]
                df = df.drop(columns=[starting_col, ending_col])
                continue
            level_values = list(set(levels[starting_col] + levels[ending_col]))
            level_cols = []
            for level in level_values:
                starting_level_col = "_".join([starting_col, str(level)])
                ending_level_col = "_".join([ending_col, str(level)])
                if starting_level_col not in df.columns:
                    # We have no starting, just ending
                    col_type = datatypes[ending_level_col].name
                    if col_type == "bool":
                        cols_to_append[f"{base_col}_{level}_DIFF"] = df[ending_level_col].astype(int)
                    else:
                        cols_to_append[f"{base_col}_{level}_DIFF"] = df[ending_level_col]
                    level_cols.append(ending_level_col)
                elif ending_level_col not in df.columns:
                    # We have no ending, just starting
                    col_type = datatypes[starting_level_col].name
                    if col_type == "bool":
                        cols_to_append[f"{base_col}_{level}_DIFF"] = -1 * df[starting_level_col].astype(int)
                    else:
                        cols_to_append[f"{base_col}_{level}_DIFF"] = -1 * df[ending_level_col]
                    level_cols.append(starting_level_col)
                else:
                    col_type = datatypes[starting_level_col].name
                    if col_type == "bool":
                        cols_to_append[f"{base_col}_{level}_DIFF"] = (
                            df[ending_level_col].astype(int) - df[starting_level_col].astype(int)
                        )
                    else:
                        cols_to_append[f"{base_col}_{level}_DIFF"] = df[ending_level_col] - df[starting_level_col]
                    level_cols.extend([starting_level_col, ending_level_col])
            # Drop the columns
            df = df.drop(columns=level_cols)
        cols_to_append = pd.DataFrame(cols_to_append)
        df = pd.concat([df, cols_to_append], axis=1)
        # Perform a final coercing of string True/False columns to boolean
        df = cls.coerce_boolean_columns(df, cols_to_ignore=key_columns)
        return df
--- a/model_data/simulation_system/core/FeatureProcessor.py
+++ b/model_data/simulation_system/core/FeatureProcessor.py
--- a/model_data/analysis/init.py
+++ b/model_data/analysis/init.py
--- a/model_data/simulation_system/generate_rdsap_change.py
+++ b/model_data/simulation_system/generate_rdsap_change.py
@ -4,25 +4,24 @@ from tqdm import tqdm
 import msgpack
 from pathlib import Path
-from model_data.simulation_system.core.Settings import (
+from etl.epc.settings import (
    MANDATORY_FIXED_FEATURES,
    LATEST_FIELD,
    COMPONENT_FEATURES,
    RDSAP_RESPONSE,
    HEAT_DEMAND_RESPONSE,
    COLUMNS_TO_MERGE_ON,
    EARLIEST_EPC_DATE,
    CARBON_RESPONSE,
 )
-from model_data.simulation_system.core.DataProcessor import DataProcessor
+from etl.epc.DataProcessor import DataProcessor
-from utils.s3 import save_dataframe_to_s3_parquet, read_from_s3, read_dataframe_from_s3_parquet
+from utils.s3 import save_dataframe_to_s3_parquet, read_from_s3
 from recommendations.rdsap_tables import england_wales_age_band_lookup
 from recommendations.recommendation_utils import (
    get_wall_u_value, get_roof_u_value, get_floor_u_value, estimate_perimeter,
    get_wall_type
 )
-DATA_DIRECTORY = Path(__file__).parent / "model_data" / "simulation_system" / "data" / "all-domestic-certificates"
+DATA_DIRECTORY = Path(__file__).parent / "local_data" / "all-domestic-certificates"
 def get_cleaned():
@ -364,21 +363,6 @@ def make_uvalues(df):
    return df
 def clean_missings_after_description_process(df):
    """
    Fill the remaining null values in every column of df that still contains any

    The fill value is chosen per column from the values already present: False when the column
    contains True or False, otherwise "none" when it contains "none", otherwise "Unknown".
    The dataframe is modified in place

    :param df: dataframe to fill
    :return: the same dataframe with the nulls filled
    """
    null_counts = pd.isnull(df).sum()
    for column in null_counts[null_counts > 0].index:
        present = df[column].unique()
        # Boolean columns take priority over the "none" marker
        if True in present or False in present:
            replacement = False
        elif "none" in present:
            replacement = "none"
        else:
            replacement = "Unknown"
        df[column] = df[column].fillna(replacement)

    return df
 def app():
    # Get all the files in the directory
@ -400,6 +384,8 @@ def app():
        data_processor = DataProcessor(filepath=filepath)
        df = data_processor.pre_process()
        df[df["WALLS_DESCRIPTION"].str.contains("Cavity")]["WALLS_DESCRIPTION"].unique()
        cleaning_averages = data_processor.make_cleaning_averages()
        # We have some odd cases with missing constituency so we fill
@ -512,12 +498,11 @@ def app():
        # Add some temporal features - we look at the days from the standard starting point in time
        # for the starting and ending date so all records are from a fixed point
-        data_by_urpn_df["DAYS_TO_STARTING"] = (
+        data_by_urpn_df["DAYS_TO_STARTING"] = DataProcessor.calculate_days_to(
-            pd.to_datetime(data_by_urpn_df["LODGEMENT_DATE_STARTING"]) - pd.to_datetime(EARLIEST_EPC_DATE)
+            data_by_urpn_df["LODGEMENT_DATE_STARTING"])
-        ).dt.days
+
-        data_by_urpn_df["DAYS_TO_ENDING"] = (
+        data_by_urpn_df["DAYS_TO_ENDING"] = DataProcessor.calculate_days_to(
-            pd.to_datetime(data_by_urpn_df["LODGEMENT_DATE_ENDING"]) - pd.to_datetime(EARLIEST_EPC_DATE)
+            data_by_urpn_df["LODGEMENT_DATE_ENDING"])
        ).dt.days
        data_by_urpn_df = data_by_urpn_df.drop(columns=["LODGEMENT_DATE_STARTING", "LODGEMENT_DATE_ENDING"])
@ -544,7 +529,7 @@ def app():
        #       Those nulls should be False. clean_missings_after_description_process handles this but shouldn't
        #       need to
-        data_by_urpn_df = clean_missings_after_description_process(data_by_urpn_df)
+        data_by_urpn_df = DataProcessor.clean_missings_after_description_process(data_by_urpn_df)
        if pd.isnull(data_by_urpn_df).sum().sum():
            raise ValueError("Null values found in dataset after process_and_prune_desriptions")
@ -564,6 +549,12 @@ def app():
    output = pd.concat(dataset)
    # Remove any records that have huge swings in their floor area
    output["tfa_diff_abs"] = abs(output["TOTAL_FLOOR_AREA_ENDING"] - output["TOTAL_FLOOR_AREA_STARTING"])
    output["tfa_diff_prop"] = output["tfa_diff_abs"] / output["TOTAL_FLOOR_AREA_STARTING"]
    output = output[output["tfa_diff_prop"] < 0.5]
    output = output.drop(columns=["tfa_diff_abs", "tfa_diff_prop"])
    uvalue_columns = [col for col in output.columns if "thermal_transmittance" in col]
    for uvalue_col in uvalue_columns:
        output[uvalue_col] = pd.to_numeric(output[uvalue_col])
@ -571,15 +562,7 @@ def app():
    save_dataframe_to_s3_parquet(
        df=output,
        bucket_name="retrofit-data-dev",
-        file_key="sap_change_model/dataset_without_differencing.parquet",
+        file_key="sap_change_model/dataset.parquet",
    )
    output = DataProcessor.difference_data(output)
    save_dataframe_to_s3_parquet(
        df=output,
        bucket_name="retrofit-data-dev",
        file_key="sap_change_model/dataset_with_differencing.parquet",
    )
--- a/etl/epc/requirements.txt
+++ b/etl/epc/requirements.txt
--- a/model_data/simulation_system/core/Settings.py
+++ b/model_data/simulation_system/core/Settings.py
@ -133,28 +133,6 @@ RDSAP_RESPONSE = "CURRENT_ENERGY_EFFICIENCY"
 HEAT_DEMAND_RESPONSE = "ENERGY_CONSUMPTION_CURRENT"
 CARBON_RESPONSE = "CO2_EMISSIONS_CURRENT"
 def ordinal(n):
    """
    Build the ordinal string for a number, e.g. 1 -> "1st", 12 -> "12th"

    :param n: the number to convert
    :return: str(n) followed by "st", "nd", "rd" or "th"
    """
    # Remainders of 10 to 20 modulo 100 (which include 11, 12 and 13) always take "th"
    if 10 <= n % 100 <= 20:
        return str(n) + "th"

    last_digit_suffixes = {1: "st", 2: "nd", 3: "rd"}
    return str(n) + last_digit_suffixes.get(n % 10, "th")
 # Lookup from floor level keys (strings and integers) onto an integer floor number.
 # The unpacked groups are applied in order, so a later group overwrites an earlier key on collision
 FLOOR_LEVEL_MAP = {
    "Basement": -1,
    "Ground": 0,
    "ground floor": 0,
    "20+": 20,
    "21st or above": 21,
    # Zero-padded two character strings, "00" to "20"
    **{str(i).zfill(2): i for i in range(0, 21)},
    # Ordinal strings as produced by ordinal(), for -1 to 20
    **{ordinal(i): i for i in range(-1, 21)},
    # Plain numeric strings, "-1" to "20"
    **{str(i): i for i in range(-1, 21)},
    # Integer keys -1 to 20 map to themselves
    **{i: i for i in range(-1, 21)},
 }
 BUILT_FORM_REMAP = {
    "Enclosed End-Terrace": "End-Terrace",
    "Enclosed Mid-Terrace": "Mid-Terrace",
@ -212,10 +190,66 @@ fill_na_map = {
    "NUMBER_OPEN_FIREPLACES": 0
 }
-# After the property descriptions have been re-remapped, we expect these features to be fixed
+################################################################################################
-FIXED_DESCRIPTON_MAPPED_FEATURES = [
+# These are the features we need for scoring
-    'another_property_below', 'is_roof_room', 'is_granite_or_whinstone', 'is_flat', 'is_suspended',
+# We'll likely change how we do this in the future
-    'has_dwelling_above', 'is_as_built', 'is_to_external_air', 'is_cob', 'is_pitched', 'is_solid', 'is_at_rafters',
+################################################################################################
-    'is_solid_brick', 'is_loft', 'is_system_built', 'is_timber_frame', 'is_sandstone_or_limestone', 'is_filled_cavity',
+
-    'is_cavity_wall', 'is_thatched', 'is_to_unheated_space'
+STARTING_SUFFIX_COMPONENT_COLS = [
    "SAP", "HEAT_DEMAND", "CARBON", "TRANSACTION_TYPE", "MECHANICAL_VENTILATION",
    "SECONDHEAT_DESCRIPTION", "ENERGY_TARIFF", "SOLAR_WATER_HEATING_FLAG", "PHOTO_SUPPLY",
    "GLAZED_TYPE", "MULTI_GLAZE_PROPORTION", "LOW_ENERGY_LIGHTING", "NUMBER_OPEN_FIREPLACES",
    "EXTENSION_COUNT", "TOTAL_FLOOR_AREA", "FLOOR_HEIGHT", "DAYS_TO", "estimated_perimeter"
 ]
 NO_SUFFIX_COMPONENT_COLS = ['walls_thermal_transmittance', 'is_cavity_wall',
                            'is_filled_cavity', 'is_solid_brick', 'is_system_built', 'is_timber_frame',
                            'is_granite_or_whinstone', 'is_as_built', 'is_cob', 'is_sandstone_or_limestone',
                            'is_park_home', 'walls_insulation_thickness', 'external_insulation', 'internal_insulation',
                            'floor_thermal_transmittance', 'is_to_unheated_space', 'is_to_external_air', 'is_suspended',
                            'is_solid', 'another_property_below', 'floor_insulation_thickness',
                            'roof_thermal_transmittance', 'is_pitched', 'is_roof_room', 'is_loft', 'is_flat',
                            'is_thatched', 'is_at_rafters', 'has_dwelling_above', 'roof_insulation_thickness',
                            'heater_type', 'system_type', 'thermostat_characteristics', 'heating_scope',
                            'energy_recovery',
                            'hotwater_tariff_type', 'extra_features', 'chp_systems', 'distribution_system',
                            'no_system_present', 'appliance', 'has_radiators', 'has_fan_coil_units',
                            'has_pipes_in_screed_above_insulation', 'has_pipes_in_insulated_timber_floor',
                            'has_pipes_in_concrete_slab', 'has_boiler', 'has_air_source_heat_pump', 'has_room_heaters',
                            'has_electric_storage_heaters', 'has_warm_air', 'has_electric_underfloor_heating',
                            'has_electric_ceiling_heating', 'has_community_scheme', 'has_ground_source_heat_pump',
                            'has_no_system_present', 'has_portable_electric_heaters', 'has_water_source_heat_pump',
                            'has_electric_heat_pump', 'has_micro-cogeneration', 'has_solar_assisted_heat_pump',
                            'has_exhaust_source_heat_pump', 'has_community_heat_pump', 'has_electric', 'has_mains_gas',
                            'has_wood_logs', 'has_coal', 'has_oil', 'has_wood_pellets', 'has_anthracite',
                            'has_dual_fuel_mineral_and_wood', 'has_smokeless_fuel', 'has_lpg', 'has_b30k',
                            'has_electricaire', 'has_assumed_for_most_rooms', 'has_underfloor_heating',
                            'thermostatic_control', 'charging_system', 'switch_system', 'no_control', 'dhw_control',
                            'community_heating', 'multiple_room_thermostats', 'auxiliary_systems', 'trvs',
                            'rate_control',
                            'glazing_type', 'fuel_type', 'main-fuel_tariff_type', 'is_community',
                            'no_individual_heating_or_community_network', 'complex_fuel_type',
                            ]
 ENDING_SUFFIX_COMPONENT_COLS = [
    'SAP', 'HEAT_DEMAND', 'CARBON', 'TRANSACTION_TYPE', 'MECHANICAL_VENTILATION', 'SECONDHEAT_DESCRIPTION',
    'ENERGY_TARIFF', 'SOLAR_WATER_HEATING_FLAG', 'PHOTO_SUPPLY', 'GLAZED_TYPE', 'MULTI_GLAZE_PROPORTION',
    'LOW_ENERGY_LIGHTING', 'NUMBER_OPEN_FIREPLACES', 'EXTENSION_COUNT', 'TOTAL_FLOOR_AREA', 'FLOOR_HEIGHT',
    'DAYS_TO', 'walls_thermal_transmittance', 'is_park_home', 'walls_insulation_thickness',
    'external_insulation', 'internal_insulation', 'floor_thermal_transmittance', 'floor_insulation_thickness',
    'roof_thermal_transmittance', 'roof_insulation_thickness', 'heater_type', 'system_type',
    'thermostat_characteristics', 'heating_scope', 'energy_recovery', 'hotwater_tariff_type', 'extra_features',
    'chp_systems', 'distribution_system', 'no_system_present', 'appliance', 'has_radiators',
    'has_fan_coil_units', 'has_pipes_in_screed_above_insulation', 'has_pipes_in_insulated_timber_floor',
    'has_pipes_in_concrete_slab', 'has_boiler', 'has_air_source_heat_pump', 'has_room_heaters',
    'has_electric_storage_heaters', 'has_warm_air', 'has_electric_underfloor_heating',
    'has_electric_ceiling_heating', 'has_community_scheme', 'has_ground_source_heat_pump',
    'has_no_system_present', 'has_portable_electric_heaters', 'has_water_source_heat_pump',
    'has_electric_heat_pump', 'has_micro-cogeneration', 'has_solar_assisted_heat_pump',
    'has_exhaust_source_heat_pump', 'has_community_heat_pump', 'has_electric', 'has_mains_gas', 'has_wood_logs',
    'has_coal', 'has_oil', 'has_wood_pellets', 'has_anthracite', 'has_dual_fuel_mineral_and_wood',
    'has_smokeless_fuel', 'has_lpg', 'has_b30k', 'has_electricaire', 'has_assumed_for_most_rooms',
    'has_underfloor_heating', 'thermostatic_control', 'charging_system', 'switch_system', 'no_control',
    'dhw_control', 'community_heating', 'multiple_room_thermostats', 'auxiliary_systems', 'trvs',
    'rate_control', 'glazing_type', 'fuel_type', 'main-fuel_tariff_type', 'is_community',
    'no_individual_heating_or_community_network', 'complex_fuel_type', 'estimated_perimeter'
 ]
--- a/etl/epc_clean/EpcClean.py
+++ b/etl/epc_clean/EpcClean.py
@ -4,16 +4,16 @@ from collections import defaultdict
 import pandas as pd
-from model_data.utils import correct_spelling
+from etl.epc_clean.utils import correct_spelling
-from model_data.epc_attributes.FloorAttributes import FloorAttributes
+from etl.epc_clean.epc_attributes.FloorAttributes import FloorAttributes
-from model_data.epc_attributes.HotWaterAttributes import HotWaterAttributes
+from etl.epc_clean.epc_attributes.HotWaterAttributes import HotWaterAttributes
-from model_data.epc_attributes.MainFuelAttributes import MainFuelAttributes
+from etl.epc_clean.epc_attributes.MainFuelAttributes import MainFuelAttributes
-from model_data.epc_attributes.MainheatAttributes import MainHeatAttributes
+from etl.epc_clean.epc_attributes.MainheatAttributes import MainHeatAttributes
-from model_data.epc_attributes.MainheatControlAttributes import MainheatControlAttributes
+from etl.epc_clean.epc_attributes.MainheatControlAttributes import MainheatControlAttributes
-from model_data.epc_attributes.RoofAttributes import RoofAttributes
+from etl.epc_clean.epc_attributes.RoofAttributes import RoofAttributes
-from model_data.epc_attributes.WallAttributes import WallAttributes
+from etl.epc_clean.epc_attributes.WallAttributes import WallAttributes
-from model_data.epc_attributes.WindowAttributes import WindowAttributes
+from etl.epc_clean.epc_attributes.WindowAttributes import WindowAttributes
-from model_data.epc_attributes.LightingAttributes import LightingAttributes
+from etl.epc_clean.epc_attributes.LightingAttributes import LightingAttributes
 class EpcClean:
@ -130,7 +130,7 @@ class EpcClean:
            self.cleaned[field].append(
                {
                    "original_description": description,
-                    "clean_description": cln.description.capitalize(),
+                    "clean_description": cln.description.replace("(assumed)", "").rstrip().capitalize(),
                    **cln.process()
                }
            )
--- a/model_data/plotting/init.py
+++ b/model_data/plotting/init.py
--- a/model_data/cleaner_app.py
+++ b/model_data/cleaner_app.py
@ -3,8 +3,8 @@ import os
 import pandas as pd
 import msgpack
-from model_data.EpcClean import EpcClean
+from etl.epc_clean.EpcClean import EpcClean
-from model_data.simulation_system.core.Settings import EARLIEST_EPC_DATE
+from etl.epc.settings import EARLIEST_EPC_DATE
 from pathlib import Path
 from utils.s3 import save_data_to_s3
@ -19,7 +19,7 @@ LAND_REGISTRY_PATHS = [
    os.path.abspath(os.path.dirname(__file__)) + "/model_data/local_data/pp-2017-part2.csv",
 ]
-EPC_DIRECTORY = Path(__file__).parent / "model_data" / "simulation_system" / "data" / "all-domestic-certificates"
+EPC_DIRECTORY = Path(__file__).parent / "local_data" / "all-domestic-certificates"
 ENVIRONMENT = os.getenv("ENVIRONMENT", "dev")
@ -27,7 +27,7 @@ ENVIRONMENT = os.getenv("ENVIRONMENT", "dev")
 def app():
    """
    For a pre-defined list of constituencies and property data_types, we'll download EPC data from the API
-    and produce a dataset of cleaned fields so that when we get new properties, we can quickly
+    and produce a dataset of cleaned fields so that when we get new epc, we can quickly
    sanitise any description data
    Currently, this application is just run on a local machine
@ -36,9 +36,6 @@ def app():
    cleaned_data = {}
    epc_directories = [entry for entry in EPC_DIRECTORY.iterdir() if entry.is_dir()]
    for directory in tqdm(epc_directories):
        directory_destructured = str(directory).split("/")[-1].split("-")
        gss_code = directory_destructured[1]
        local_authority = directory_destructured[2]
        data = pd.read_csv(directory / "certificates.csv", low_memory=False)
        # Rename the columns to the same format as the api returns
@ -62,14 +59,6 @@ def app():
                new_data = [x for x in data if x["original_description"] not in existing_descriptions]
                cleaned_data[k].extend(new_data)
        # TODO: Add property age band into this
        # uvalue_estimates = UvalueEstimations(data=data)
        # uvalue_estimates.get_estimates(cleaner=cleaner)
        # # TODO: Store these to a s3
        # uvalue_estimates.walls
        # uvalue_estimates.floors
        # uvalue_estimates.roofs
    # Basic check to make sure all descriptions are unique
    for _, cleaned in cleaned_data.items():
        descriptions = [x["original_description"] for x in cleaned]
--- a/etl/epc_clean/epc_attributes/FloorAttributes.py
+++ b/etl/epc_clean/epc_attributes/FloorAttributes.py
@ -1,7 +1,7 @@
 import re
 from typing import Dict, Union
-from model_data.BaseUtility import Definitions
+from BaseUtility import Definitions
-from model_data.epc_attributes.attribute_utils import extract_thermal_transmittance, extract_component_types
+from etl.epc_clean.epc_attributes.attribute_utils import extract_thermal_transmittance, extract_component_types
 class FloorAttributes(Definitions):
--- a/etl/epc_clean/epc_attributes/HotWaterAttributes.py
+++ b/etl/epc_clean/epc_attributes/HotWaterAttributes.py
@ -1,6 +1,6 @@
 from typing import Dict, Union
-from model_data.BaseUtility import Definitions
+from BaseUtility import Definitions
-from model_data.epc_attributes.attribute_utils import clean_description, find_keyword
+from etl.epc_clean.epc_attributes.attribute_utils import clean_description, find_keyword
 class HotWaterAttributes(Definitions):
--- a/etl/epc_clean/epc_attributes/LightingAttributes.py
+++ b/etl/epc_clean/epc_attributes/LightingAttributes.py
@ -1,6 +1,6 @@
 import re
-from model_data.epc_attributes.attribute_utils import clean_description
+from etl.epc_clean.epc_attributes.attribute_utils import clean_description
-from model_data.utils import correct_spelling
+from etl.epc_clean.utils import correct_spelling
 class LightingAttributes:
@ -27,7 +27,7 @@ class LightingAttributes:
        lel_match2 = re.search(r"goleuadau ynni-isel mewn (\d+)%? o'r mannau gosod", self.description)
        if lel_match is not None or lel_match2 is not None:
-            
+
            # Perform the actual translation
            percentage = lel_match.group(1) if lel_match is not None else lel_match2.group(1)
            self.description = f"low energy lighting in {percentage}% of fixed outlets"
--- a/etl/epc_clean/epc_attributes/MainFuelAttributes.py
+++ b/etl/epc_clean/epc_attributes/MainFuelAttributes.py
@ -1,6 +1,6 @@
 from typing import Dict, Union
-from model_data.BaseUtility import Definitions
+from BaseUtility import Definitions
-from model_data.epc_attributes.attribute_utils import clean_description, remove_punctuation, find_keyword
+from etl.epc_clean.epc_attributes.attribute_utils import clean_description, remove_punctuation, find_keyword
 class MainFuelAttributes(Definitions):
--- a/etl/epc_clean/epc_attributes/MainheatAttributes.py
+++ b/etl/epc_clean/epc_attributes/MainheatAttributes.py
@ -1,5 +1,5 @@
-from model_data.BaseUtility import Definitions
+from BaseUtility import Definitions
-from model_data.epc_attributes.attribute_utils import clean_description, process_part, switch_chars
+from etl.epc_clean.epc_attributes.attribute_utils import clean_description, process_part, switch_chars
 from typing import Dict, Union
--- a/etl/epc_clean/epc_attributes/MainheatControlAttributes.py
+++ b/etl/epc_clean/epc_attributes/MainheatControlAttributes.py
@ -1,6 +1,6 @@
 from typing import Dict, Union
-from model_data.BaseUtility import Definitions
+from BaseUtility import Definitions
-from model_data.epc_attributes.attribute_utils import clean_description, find_keyword
+from etl.epc_clean.epc_attributes.attribute_utils import clean_description, find_keyword
 class MainheatControlAttributes(Definitions):
--- a/etl/epc_clean/epc_attributes/RoofAttributes.py
+++ b/etl/epc_clean/epc_attributes/RoofAttributes.py
@ -1,7 +1,7 @@
 import re
 from typing import Dict, Union
-from model_data.BaseUtility import Definitions
+from BaseUtility import Definitions
-from model_data.epc_attributes.attribute_utils import extract_component_types, extract_thermal_transmittance
+from etl.epc_clean.epc_attributes.attribute_utils import extract_component_types, extract_thermal_transmittance
 class RoofAttributes(Definitions):
--- a/etl/epc_clean/epc_attributes/WallAttributes.py
+++ b/etl/epc_clean/epc_attributes/WallAttributes.py
@ -1,7 +1,7 @@
 import re
 from typing import Dict, Union
-from model_data.BaseUtility import Definitions
+from BaseUtility import Definitions
-from model_data.epc_attributes.attribute_utils import (
+from etl.epc_clean.epc_attributes.attribute_utils import (
    extract_component_types,
    extract_thermal_transmittance
 )
--- a/etl/epc_clean/epc_attributes/WindowAttributes.py
+++ b/etl/epc_clean/epc_attributes/WindowAttributes.py
@ -1,6 +1,6 @@
 from typing import Dict, Union
-from model_data.BaseUtility import Definitions
+from BaseUtility import Definitions
-from model_data.epc_attributes.attribute_utils import clean_description
+from etl.epc_clean.epc_attributes.attribute_utils import clean_description
 class WindowAttributes(Definitions):
--- a/model_data/simulation_system/MLModel/init.py
+++ b/model_data/simulation_system/MLModel/init.py
--- a/etl/epc_clean/epc_attributes/all_cleaners.py
+++ b/etl/epc_clean/epc_attributes/all_cleaners.py
@ -0,0 +1,21 @@
 from etl.epc_clean.epc_attributes.FloorAttributes import FloorAttributes
 from etl.epc_clean.epc_attributes.HotWaterAttributes import HotWaterAttributes
 from etl.epc_clean.epc_attributes.MainFuelAttributes import MainFuelAttributes
 from etl.epc_clean.epc_attributes.MainheatAttributes import MainHeatAttributes
 from etl.epc_clean.epc_attributes.MainheatControlAttributes import MainheatControlAttributes
 from etl.epc_clean.epc_attributes.RoofAttributes import RoofAttributes
 from etl.epc_clean.epc_attributes.WallAttributes import WallAttributes
 from etl.epc_clean.epc_attributes.WindowAttributes import WindowAttributes
 from etl.epc_clean.epc_attributes.LightingAttributes import LightingAttributes
 # Lookup from a description field key to the cleaner class (imported above) that handles that field.
 # NOTE: the lighting key previously carried a stray trailing colon ('lighting-description:'), which no
 # other key has, so a lookup by the plain field name would have missed the lighting cleaner.
 all_cleaner_map = {
    'floor-description': FloorAttributes,
    'hotwater-description': HotWaterAttributes,
    'main-fuel': MainFuelAttributes,
    'mainheat-description': MainHeatAttributes,
    'mainheatcont-description': MainheatControlAttributes,
    'roof-description': RoofAttributes,
    'walls-description': WallAttributes,
    'windows-description': WindowAttributes,
    'lighting-description': LightingAttributes,
 }
--- a/etl/epc_clean/epc_attributes/attribute_utils.py
+++ b/etl/epc_clean/epc_attributes/attribute_utils.py
--- a/model_data/simulation_system/init.py
+++ b/model_data/simulation_system/init.py
--- a/etl/epc_clean/tests/test_attribute_utils.py
+++ b/etl/epc_clean/tests/test_attribute_utils.py
@ -1,5 +1,5 @@
 import pytest
-import model_data.epc_attributes.attribute_utils as attribute_utils
+import etl.epc_clean.epc_attributes.attribute_utils as attribute_utils
 def test_extract_thermal_transmittance():
--- a/etl/epc_clean/tests/test_data/EpcClean_inputs.obj
+++ b/etl/epc_clean/tests/test_data/EpcClean_inputs.obj
--- a/etl/epc_clean/tests/test_data/test_floor_attributes_cases.py
+++ b/etl/epc_clean/tests/test_data/test_floor_attributes_cases.py
--- a/etl/epc_clean/tests/test_data/test_hot_water_attributes_cases.py
+++ b/etl/epc_clean/tests/test_data/test_hot_water_attributes_cases.py
--- a/etl/epc_clean/tests/test_data/test_lighting_attributes_cases.py
+++ b/etl/epc_clean/tests/test_data/test_lighting_attributes_cases.py
--- a/etl/epc_clean/tests/test_data/test_main_fuel_attributes_cases.py
+++ b/etl/epc_clean/tests/test_data/test_main_fuel_attributes_cases.py
--- a/etl/epc_clean/tests/test_data/test_mainheat_attributes_cases.py
+++ b/etl/epc_clean/tests/test_data/test_mainheat_attributes_cases.py
--- a/etl/epc_clean/tests/test_data/test_mainheat_control_attributes_cases.py
+++ b/etl/epc_clean/tests/test_data/test_mainheat_control_attributes_cases.py
--- a/etl/epc_clean/tests/test_data/test_roof_attributes_cases.py
+++ b/etl/epc_clean/tests/test_data/test_roof_attributes_cases.py
--- a/etl/epc_clean/tests/test_data/test_wall_attributes_cases.py
+++ b/etl/epc_clean/tests/test_data/test_wall_attributes_cases.py
--- a/etl/epc_clean/tests/test_data/test_window_attributes_cases.py
+++ b/etl/epc_clean/tests/test_data/test_window_attributes_cases.py
--- a/etl/epc_clean/tests/test_epc_clean.py
+++ b/etl/epc_clean/tests/test_epc_clean.py
@ -1,6 +1,6 @@
 import pytest
 import pickle
-from model_data.EpcClean import EpcClean
+from etl.epc_clean.EpcClean import EpcClean
 from pathlib import Path
 # For local testing
--- a/etl/epc_clean/tests/test_floor_attributes.py
+++ b/etl/epc_clean/tests/test_floor_attributes.py
@ -1,6 +1,6 @@
 import pytest
-from model_data.tests.test_data.test_floor_attributes_cases import clean_floor_cases
+from etl.epc_clean.tests.test_data.test_floor_attributes_cases import clean_floor_cases
-from model_data.epc_attributes.FloorAttributes import FloorAttributes
+from etl.epc_clean.epc_attributes.FloorAttributes import FloorAttributes
 class TestCleanFloor:
--- a/etl/epc_clean/tests/test_hotwater_attributes.py
+++ b/etl/epc_clean/tests/test_hotwater_attributes.py
@ -1,6 +1,6 @@
 import pytest
-from model_data.epc_attributes.HotWaterAttributes import HotWaterAttributes
+from etl.epc_clean.epc_attributes.HotWaterAttributes import HotWaterAttributes
-from model_data.tests.test_data.test_hot_water_attributes_cases import hotwater_cases
+from etl.epc_clean.tests.test_data.test_hot_water_attributes_cases import hotwater_cases
 class TestHotWaterAttributes:
--- a/etl/epc_clean/tests/test_lighting_attributes.py
+++ b/etl/epc_clean/tests/test_lighting_attributes.py
@ -1,7 +1,7 @@
 import pandas as pd
 import pytest
-from model_data.tests.test_data.test_lighting_attributes_cases import test_cases
+from etl.epc_clean.tests.test_data.test_lighting_attributes_cases import test_cases
-from model_data.epc_attributes.LightingAttributes import LightingAttributes
+from etl.epc_clean.epc_attributes.LightingAttributes import LightingAttributes
 # An example averages dataset to use in tests. It is a dictionary where the key is a lighting description and the
 # value is the expected proportion.
--- a/etl/epc_clean/tests/test_mainfuel_attributes.py
+++ b/etl/epc_clean/tests/test_mainfuel_attributes.py
@ -1,6 +1,6 @@
 import pytest
-from model_data.epc_attributes.MainFuelAttributes import MainFuelAttributes
+from etl.epc_clean.epc_attributes.MainFuelAttributes import MainFuelAttributes
-from model_data.tests.test_data.test_main_fuel_attributes_cases import mainfuel_cases
+from etl.epc_clean.tests.test_data.test_main_fuel_attributes_cases import mainfuel_cases
 class TestMainHeatControlAttributes:
--- a/etl/epc_clean/tests/test_mainheat_attributes.py
+++ b/etl/epc_clean/tests/test_mainheat_attributes.py
@ -1,6 +1,6 @@
 import pytest
-from model_data.epc_attributes.MainheatAttributes import MainHeatAttributes
+from etl.epc_clean.epc_attributes.MainheatAttributes import MainHeatAttributes
-from model_data.tests.test_data.test_mainheat_attributes_cases import mainheat_cases
+from etl.epc_clean.tests.test_data.test_mainheat_attributes_cases import mainheat_cases
 class TestMainHeatAttributes:
--- a/etl/epc_clean/tests/test_mainheat_controls_attributes.py
+++ b/etl/epc_clean/tests/test_mainheat_controls_attributes.py
@ -1,6 +1,6 @@
 import pytest
-from model_data.epc_attributes.MainheatControlAttributes import MainheatControlAttributes
+from etl.epc_clean.epc_attributes.MainheatControlAttributes import MainheatControlAttributes
-from model_data.tests.test_data.test_mainheat_control_attributes_cases import mainheat_control_cases
+from etl.epc_clean.tests.test_data.test_mainheat_control_attributes_cases import mainheat_control_cases
 class TestMainHeatControlAttributes:
--- a/etl/epc_clean/tests/test_roof_attributes.py
+++ b/etl/epc_clean/tests/test_roof_attributes.py
@ -1,7 +1,7 @@
 import pytest
 from pathlib import Path
-from model_data.tests.test_data.test_roof_attributes_cases import clean_roof_test_cases
+from etl.epc_clean.tests.test_data.test_roof_attributes_cases import clean_roof_test_cases
-from model_data.epc_attributes.RoofAttributes import RoofAttributes
+from etl.epc_clean.epc_attributes.RoofAttributes import RoofAttributes
 # For local testing
 if __file__ == "<input>":
--- a/etl/epc_clean/tests/test_utils.py
+++ b/etl/epc_clean/tests/test_utils.py
@ -1,4 +1,4 @@
-from model_data.utils import is_percentage_or_number, correct_spelling
+from etl.epc_clean.utils import is_percentage_or_number, correct_spelling
 class TestUtils:
--- a/etl/epc_clean/tests/test_wall_attributes.py
+++ b/etl/epc_clean/tests/test_wall_attributes.py
@ -1,6 +1,6 @@
 import pytest
-from model_data.epc_attributes.WallAttributes import WallAttributes
+from etl.epc_clean.epc_attributes.WallAttributes import WallAttributes
-from model_data.tests.test_data.test_wall_attributes_cases import wall_cases
+from etl.epc_clean.tests.test_data.test_wall_attributes_cases import wall_cases
 class TestWallAttributes:
--- a/etl/epc_clean/tests/test_window_attributes.py
+++ b/etl/epc_clean/tests/test_window_attributes.py
@ -1,6 +1,6 @@
 import pytest
-from model_data.epc_attributes.WindowAttributes import WindowAttributes
+from etl.epc_clean.epc_attributes.WindowAttributes import WindowAttributes
-from model_data.tests.test_data.test_window_attributes_cases import windows_cases
+from etl.epc_clean.tests.test_data.test_window_attributes_cases import windows_cases
 class TestWindowAttributes:
--- a/etl/epc_clean/utils.py
+++ b/etl/epc_clean/utils.py
--- a/etl/land_registry/LandRegistryClient.py
+++ b/etl/land_registry/LandRegistryClient.py
--- a/model_data/simulation_system/core/init.py
+++ b/model_data/simulation_system/core/init.py
--- a/etl/land_registry/app.py
+++ b/etl/land_registry/app.py
--- a/etl/land_registry/sample_addresses.pkl
+++ b/etl/land_registry/sample_addresses.pkl
--- a/etl/land_registry/tests/test_land_registry_client.py
+++ b/etl/land_registry/tests/test_land_registry_client.py
@ -1,6 +1,6 @@
 import pandas as pd
 from unittest.mock import patch, call
-from model_data.LandRegistryClient import LandRegistryClient
+from etl.land_registry.LandRegistryClient import LandRegistryClient
 class TestLandRegistryClient:
--- a/etl/property_dimensions/init.py
+++ b/etl/property_dimensions/init.py
--- a/etl/property_dimensions/app.py
+++ b/etl/property_dimensions/app.py
@ -0,0 +1,54 @@
 """
 This is a simple application which estimates some of the basic dimensions of a property based on EPC
 data which we can use as a proxy value if we don't have this information on the EPC
 """
 import os
 from pathlib import Path
 import pandas as pd
 from tqdm import tqdm
 from etl.epc.settings import EARLIEST_EPC_DATE
 from etl.epc.DataProcessor import DataProcessor
 from BaseUtility import Definitions
 from utils.s3 import save_dataframe_to_s3_parquet
 # Local folder that app() scans; each sub-directory is expected to hold a certificates.csv file
 DATA_DIRECTORY = Path(__file__).parent / "local_data" / "all-domestic-certificates"
 # Columns app() groups by before aggregating room count, floor area and floor height
 GROUPBY = ["PROPERTY_TYPE", "BUILT_FORM", "CONSTITUENCY", "CONSTRUCTION_AGE_BAND"]
 # Destination S3 bucket for the parquet output; overridable through the BUCKET environment variable
 BUCKET = os.environ.get("BUCKET", "retrofit-data-dev")
 def app():
    """
    Estimate typical property dimensions from EPC certificates and store them in S3.

    For every sub-directory of DATA_DIRECTORY the ``certificates.csv`` file is read and reduced to rows
    lodged on or after EARLIEST_EPC_DATE that have a UPRN, a usable construction age band, a floor
    area, a habitable room count and a floor height. The remaining rows are aggregated per GROUPBY
    combination (median habitable rooms, mean floor area, mean floor height) and written to
    ``property_dimensions/<LOCAL_AUTHORITY>.parquet`` in BUCKET.

    Directories that have no usable rows left after filtering are reported and skipped; previously
    they crashed the whole run with an IndexError when the local authority was looked up.

    :raises ValueError: if one certificates file contains more than one local authority
    """
    required_columns = ["CONSTRUCTION_AGE_BAND", "TOTAL_FLOOR_AREA", "NUMBER_HABITABLE_ROOMS", "FLOOR_HEIGHT"]
    directories = [entry for entry in DATA_DIRECTORY.iterdir() if entry.is_dir()]
    for directory in tqdm(directories):
        data = pd.read_csv(directory / "certificates.csv", low_memory=False)
        data = data[data["LODGEMENT_DATE"] >= EARLIEST_EPC_DATE]
        data = data[~pd.isnull(data["UPRN"])]
        data["TOTAL_FLOOR_AREA"] = data["TOTAL_FLOOR_AREA"].astype(float)
        data["CONSTRUCTION_AGE_BAND"] = data["CONSTRUCTION_AGE_BAND"].apply(
            DataProcessor.clean_construction_age_band
        )
        # Drop rows that are missing any of the values we aggregate or group on
        data = data.dropna(subset=required_columns)
        data = data[~data["CONSTRUCTION_AGE_BAND"].isin(Definitions.DATA_ANOMALY_MATCHES)]
        if data.empty:
            # Nothing to aggregate and no local authority to name the output after
            tqdm.write(f"Skipping {directory.name}: no usable certificates after filtering")
            continue
        # Validate before doing the aggregation work; the output file is keyed on this value
        local_authority = data["LOCAL_AUTHORITY"].unique()
        if len(local_authority) > 1:
            raise ValueError("More than one la in data")
        local_authority = local_authority[0]
        df = (
            data.groupby(GROUPBY)
            .agg({"NUMBER_HABITABLE_ROOMS": "median", "TOTAL_FLOOR_AREA": "mean", "FLOOR_HEIGHT": "mean"})
            .reset_index()
        )
        save_dataframe_to_s3_parquet(
            df=df,
            bucket_name=BUCKET,
            file_key=f"property_dimensions/{local_authority}.parquet",
        )
--- a/etl/spatial/BoreholeClient.py
+++ b/etl/spatial/BoreholeClient.py
@ -56,7 +56,7 @@ class BoreholeClient:
    # EXAMPLE
    # There are ~1.4 million entries in this dataset and so we firstly want to reduce the number of
-    # entries in here if possible before we produce any form of comparison between our properties, to infer
+    # entries in here if possible before we produce any form of comparison between our epc, to infer
    # the distance from the property to the nearest borehole
    # Let's take a sample
--- a/conservation_areas/ConservationAreaClient.py
+++ b/conservation_areas/ConservationAreaClient.py
@ -1,12 +1,55 @@
-from enum import Enum
+import boto3
 import os
 import tempfile
 import geopandas as gpd
 import numpy as np
 from enum import Enum
 from shapely.geometry import Point
 from utils.logger import setup_logger
 from utils.s3 import read_io_from_s3
 from datatypes.datatypes import OpenUprnCoordinateData
 logger = setup_logger()
 def read_shapefile_from_s3(bucket_name, s3_file_key):
    """
    Read a shapefile from S3 into a GeoDataFrame.

    Every object stored directly in the same S3 "folder" as ``s3_file_key`` is downloaded into a
    temporary directory (so that the files sitting alongside the .shp are available locally) and the
    shapefile is then read from there.

    :param bucket_name: The name of the S3 bucket
    :param s3_file_key: The file path of the shape file
    :return: GeoDataFrame containing the shapefile data
    :raises FileNotFoundError: if the shapefile itself was not found under the folder prefix
    """
    s3_folder_key, _, shape_file_key = s3_file_key.rpartition("/")
    # Trailing slash so that e.g. "spatial/areas" does not also match "spatial/areas_old/..."
    prefix = f"{s3_folder_key}/" if s3_folder_key else ""
    s3_client = boto3.client('s3')
    # TemporaryDirectory creates the directory itself, so no explicit makedirs is needed
    with tempfile.TemporaryDirectory() as tmpdirname:
        logger.info("Creating temporary directory at %s", tmpdirname)
        # Paginate: a single list call returns at most 1000 keys, and a page for an empty prefix
        # has no 'Contents' entry at all
        paginator = s3_client.get_paginator("list_objects_v2")
        for page in paginator.paginate(Bucket=bucket_name, Prefix=prefix):
            for s3_object in page.get("Contents", []):
                file_key = s3_object['Key']
                file_name = file_key[len(prefix):]
                # Skip the zero-byte "folder" placeholder (empty name) and anything in a nested
                # folder, which would otherwise be flattened on top of the files we need
                if not file_name or "/" in file_name:
                    continue
                local_file_path = os.path.join(tmpdirname, file_name)
                with open(local_file_path, 'wb') as tmpfile:
                    s3_client.download_fileobj(bucket_name, file_key, tmpfile)
        shapefile_path = os.path.join(tmpdirname, shape_file_key)
        if not os.path.isfile(shapefile_path):
            raise FileNotFoundError(f"Shapefile s3://{bucket_name}/{s3_file_key} was not found")
        # Read while the temporary directory still exists; it is removed on leaving the with block
        gdf = gpd.read_file(shapefile_path)
    return gdf
 class ConservationAreaClient:
    """
    Class to interact with and manipulate conservation area data. The historic england data
@ -18,13 +61,14 @@ class ConservationAreaClient:
    """
    SOURCES = ["historic_england"]
-    IN_CONSERVATION_AREA = "in_conservation_area"
+    IN_CONSERVATION_AREA = True
-    NOT_IN_CONSERVATION_AREA = "not_in_conservation_area"
+    NOT_IN_CONSERVATION_AREA = False
-    UNKNOWN = "unknown"
+    UNKNOWN = None
-    def __init__(self, historic_england_path, gov_path):
+    def __init__(self, historic_england_path, gov_path, bucket):
        self.historic_england_path = historic_england_path
        self.gov_path = gov_path
        self.bucket = bucket
        self.historic_england_data = None
        self.gov_data = None
@ -34,11 +78,21 @@ class ConservationAreaClient:
        Read the data
        """
        logger.info("Reading in historic england conservation area shapefile")
-        self.historic_england_data = gpd.read_file(self.historic_england_path)
+        self.historic_england_data = read_shapefile_from_s3(
            bucket_name=self.bucket, s3_file_key=self.historic_england_path
        )
        logger.info("Reading in Govenment conservation area geojson")
-        self.gov_data = gpd.read_file(self.gov_path)
+
        self.gov_data = gpd.read_file(
            read_io_from_s3(
                bucket_name=self.bucket,
                file_key=self.gov_path
            )
        )
        self.gov_data = self.gov_data.drop(columns=["dataset"])
        # Convert the gov data to british national grid co-ordinates
        self.gov_data = self.gov_data.to_crs("EPSG:27700")
    def is_in_conservation_area(self, coordinates: OpenUprnCoordinateData):
@ -71,6 +125,43 @@ class ConservationAreaClient:
            else:
                return ConservationAreaClient.UNKNOWN
    def is_in_conservation_area_vectorised(self, uprn_gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
        """
        Label every row of ``uprn_gdf`` with a ``conservation_status`` column.

        The Historic England polygons are checked first; UPRNs that are not confirmed there are then
        checked against the government polygons. The column is set to IN_CONSERVATION_AREA,
        NOT_IN_CONSERVATION_AREA or UNKNOWN.

        :param uprn_gdf: GeoDataFrame with a "UPRN" column and point geometry; modified in place
        :return: the same GeoDataFrame with the ``conservation_status`` column added
        """
        he_join = gpd.sjoin(uprn_gdf, self.historic_england_data, how="left", predicate="within")
        # A missing right index means the point matched no Historic England polygon at all
        he_unmatched = he_join.index_right.isna()
        # Polygons carrying the "No data available" name are matches that tell us nothing definitive
        he_definitive = ~he_unmatched & (he_join["NAME"] != "No data available for publication by HE")
        he_inside = he_join[he_definitive]["UPRN"].unique()
        he_outside = he_join.loc[~he_join["UPRN"].isin(he_inside) & he_unmatched, "UPRN"].unique()

        # Everything not confirmed by Historic England gets a second look in the government data
        remaining_gdf = uprn_gdf[~uprn_gdf["UPRN"].isin(he_inside)]
        gov_join = gpd.sjoin(remaining_gdf, self.gov_data, how="left", predicate="within")
        gov_inside = gov_join.loc[~gov_join.index_right.isna(), "UPRN"].unique()

        # Later entries overwrite earlier ones, so a government match wins over a Historic England miss
        uprn_gdf['conservation_status'] = self.UNKNOWN
        status_by_uprns = (
            (he_inside, self.IN_CONSERVATION_AREA),
            (he_outside, self.NOT_IN_CONSERVATION_AREA),
            (gov_inside, self.IN_CONSERVATION_AREA),
        )
        for uprns, status in status_by_uprns:
            uprn_gdf.loc[uprn_gdf["UPRN"].isin(uprns), 'conservation_status'] = status
        return uprn_gdf
    def is_in_conservation_area_historic_england(self, x_bng: float, y_bng: float) -> str:
        """
        Check if a property is in a conservation area
--- a/etl/spatial/OpenUprnClient.py
+++ b/etl/spatial/OpenUprnClient.py
@ -0,0 +1,118 @@
 import os
 from tqdm import tqdm
 import pandas as pd
 import geopandas as gpd
 from utils.logger import setup_logger
 from utils.s3 import read_io_from_s3, save_dataframe_to_s3_parquet
 logger = setup_logger()
 class OpenUprnClient:
    """
    This client reads in the Open UPRN data from s3 which can be downloaded from here:
    https://osdatahub.os.uk/downloads/open/OpenUPRN
    This dataset contains a lookup of UPRNs to coordinates.
    Specs for this dataset can be found here:
    https://www.ordnancesurvey.co.uk/documents/product-support/tech-spec/open-uprn-techspec-v1.pdf
    """

    def __init__(self, path, bucket, uprns=None):
        """
        :param path: S3 key (for read) or local path (for read_local) of the Open UPRN csv
        :param bucket: S3 bucket the csv is read from
        :param uprns: optional iterable of UPRNs to restrict the data to; values are cast to int
        """
        self.path = path
        self.bucket = bucket
        self.uprns = [int(x) for x in uprns] if uprns else None
        self.data = None
        # Mapping of partition number -> parquet filename, populated by create_file_partitions.
        # It records which file each UPRN's data is contained in.
        self.filenames = None

    def _filter_to_requested_uprns(self, df):
        """Return only the rows for the UPRNs given at construction (all rows if none were given)."""
        if self.uprns:
            return df[df["UPRN"].isin(self.uprns)]
        return df

    def read(self):
        """
        Read the Open UPRN csv from S3 into ``self.data``, restricted to the requested UPRNs.
        :return:
        """
        logger.info("Reading in open uprn data")
        df = pd.read_csv(
            read_io_from_s3(
                bucket_name=self.bucket,
                file_key=self.path
            )
        )
        self.data = self._filter_to_requested_uprns(df)

    def read_local(self):
        """
        For local testing: read the csv from a local path instead of S3
        :return:
        """
        logger.info("Reading in open uprn data")
        df = pd.read_csv(self.path)
        self.data = self._filter_to_requested_uprns(df)

    @staticmethod
    def _partition_numbers(n_rows, partition_size):
        """Return, for n_rows rows in sorted order, the partition number of each row by position."""
        return pd.RangeIndex(n_rows).to_numpy() // partition_size

    def create_file_partitions(self, partition_size=50000):
        """
        Sort the data by UPRN and split it into consecutive partitions of ``partition_size`` rows.

        Adds ``partition`` and ``filename`` columns to ``self.data`` and fills ``self.filenames`` with
        one ``<min uprn>_<max uprn>.parquet`` name per partition.

        :param partition_size: number of rows per partition
        """
        logger.info("Sorting data by UPRN ascending")
        self.data = self.data.sort_values("UPRN", ascending=True)
        logger.info("Creating partitions")
        # Partition on the row's position in the sorted data, not on its index label: the labels
        # still reflect the original csv order (and have gaps after filtering), which would give
        # fragmented partitions whose UPRN ranges overlap
        self.data['partition'] = self._partition_numbers(len(self.data), partition_size)
        self.filenames = {}
        for partition, group in tqdm(self.data.groupby('partition')):
            min_uprn = group['UPRN'].min()
            max_uprn = group['UPRN'].max()
            self.filenames[partition] = f"{min_uprn}_{max_uprn}.parquet"
        self.data['filename'] = self.data['partition'].map(self.filenames)

    @staticmethod
    def find_filename_for_uprn(uprn, filenames):
        """
        Find the partition file whose UPRN range contains ``uprn``.

        :param uprn: UPRN to look up
        :param filenames: iterable of ``<min>_<max>.parquet`` names, or a mapping whose values are
            such names (the shape of ``self.filenames``)
        :return: the matching filename, or None when no range contains the UPRN
        """
        if isinstance(filenames, dict):
            filenames = filenames.values()
        for filename in filenames:
            min_uprn, max_uprn = map(int, filename.replace(".parquet", "").split("_"))
            if min_uprn <= uprn <= max_uprn:
                return filename
        return None

    @staticmethod
    def convert_bng_data_to_gpd(df):
        """
        Build a GeoDataFrame of points from the X_COORDINATE / Y_COORDINATE columns of ``df``.
        """
        gpd_data = gpd.GeoDataFrame(
            df,
            geometry=gpd.points_from_xy(df.X_COORDINATE, df.Y_COORDINATE),
            crs="EPSG:27700"  # British National Grid
        )
        return gpd_data

    def save_filenames_to_s3(self, bucket_name):
        """
        Save the filenames to s3, together with the lower and upper UPRN bound of each file
        :param bucket_name:
        :return:
        """
        # S3 keys always use forward slashes, so do not build them with os.path.join
        file_key = "spatial/filename_meta.parquet"
        filenames = pd.DataFrame({"filenames": list(self.filenames.values())})
        # The two digit runs are the bounds; the ".parquet" suffix has no digits so needs no stripping
        bounds = filenames['filenames'].str.extract(r'(\d+)_(\d+)')
        filenames['lower'] = bounds[0].astype(int)
        filenames['upper'] = bounds[1].astype(int)
        logger.info("Saving filenames to s3 at {}".format(file_key))
        save_dataframe_to_s3_parquet(
            df=filenames,
            file_key=file_key,
            bucket_name=bucket_name
        )
--- a/etl/spatial/README.md
+++ b/etl/spatial/README.md
@ -0,0 +1,48 @@
 # Spatial - Geospatial Data Processing Service
 ## Overview
 The Spatial service is designed to read, process, and analyze geospatial data related to
 conservation areas and special buildings. It uses datasets from Historic England and the
 UK government to determine whether a given UPRN (Unique Property Reference Number) is within
 a conservation area or is a listed building. The processed data is saved back to an S3 bucket
 in a parquet format for easy retrieval and further analysis.
 ## Dependencies
 Dependencies are listed in requirements.txt. To install them, run:
 ```
 pip install -r requirements.txt
 ```
 ## Data Sources
 1. **Historic England Conservation Areas**: Shapefile containing polygons of conservation areas.
 2. **UK Government Conservation Areas**: GeoJSON file containing polygons of conservation areas.
 3. **Open UPRN Data**: CSV file with UPRN and corresponding geospatial data.
 4. **Historic England Listed Buildings**: Shapefile with information on listed buildings.
 5. **Historic England Heritage Buildings at Risk**: Shapefile with information on heritage buildings at risk.
 ## Files
 - app.py: Main application file that orchestrates the data processing flow.
 - ConservationAreaClient.py: Handles reading and processing of conservation area data.
 - OpenUprnClient.py: Manages reading and partitioning of Open UPRN data.
 - SpecialBuildingsClient.py: Takes care of reading and processing data related to special buildings.
 - requirements.txt: Lists all Python package dependencies.
 ## How to Run
 1. Make sure you have all the required packages installed.
 2. Update the S3 bucket and file path constants in app.py.
 3. Run app.py.
 ## Workflow
 1. Read the datasets for conservation areas and special buildings.
 2. Read the Open UPRN dataset and partition it into smaller chunks based on UPRN.
 3. For each partition:
    - Convert UPRN data to a geopandas GeoDataFrame.
    - Check if each UPRN is within a conservation area or is a special building.
    - Save the processed data back to S3 in parquet format.
--- a/etl/spatial/SpecialBuildingsClient.py
+++ b/etl/spatial/SpecialBuildingsClient.py
@ -0,0 +1,114 @@
 import geopandas as gpd
 from shapely.geometry import Point
 from utils.logger import setup_logger
 from etl.spatial.ConservationAreaClient import read_shapefile_from_s3
 from datatypes.datatypes import OpenUprnCoordinateData
 logger = setup_logger()
 class SpecialBuildingsClient:
    """
    Loads the Historic England listed-building and heritage-at-risk shapefiles and answers whether
    locations fall inside them, either one coordinate at a time or for a whole GeoDataFrame of UPRNs.
    """

    def __init__(self, historic_england_listed_buildings_path, historic_england_heritage_buildings_path, bucket):
        # Where the two shapefiles live in S3
        self.bucket = bucket
        self.historic_england_listed_buildings_path = historic_england_listed_buildings_path
        self.historic_england_heritage_buildings_path = historic_england_heritage_buildings_path
        # Filled in by read()
        self.historic_england_listed_buildings = None
        self.historic_england_heritage_buildings = None

    def read(self):
        """
        Load both shapefiles from S3 into GeoDataFrames held on the instance.
        """
        logger.info("Reading in historic england listed buildings shapefile")
        listed = read_shapefile_from_s3(
            bucket_name=self.bucket, s3_file_key=self.historic_england_listed_buildings_path
        )
        self.historic_england_listed_buildings = listed

        logger.info("Reading in historic england heritage buildings shapefile")
        heritage = read_shapefile_from_s3(
            bucket_name=self.bucket, s3_file_key=self.historic_england_heritage_buildings_path
        )
        # Only the heritage-at-risk data is reprojected to British National Grid here
        # NOTE(review): the listed buildings data is used as read - confirm it is already in EPSG:27700
        self.historic_england_heritage_buildings = heritage.to_crs("EPSG:27700")

    def is_listed_building(self, coordinates: OpenUprnCoordinateData) -> bool:
        """
        Check if a location specified by British National Grid coordinates is a listed building.

        :param coordinates: object exposing X_COORDINATE and Y_COORDINATE (OpenUprnCoordinateData)
        :return: True if the location is within a listed building polygon, False otherwise
        """
        location = Point(coordinates.X_COORDINATE, coordinates.Y_COORDINATE)
        hits = self.historic_england_listed_buildings.contains(location)
        if not hits.any():
            return False

        # Log which listed buildings the location falls inside
        names = self.historic_england_listed_buildings.loc[hits, "Name"]
        logger.info(f"The location is within the following listed buildings: {names.values}")
        return True

    def is_listed_building_vectorised(self, uprn_gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
        """
        Add a boolean ``is_listed_building`` column to ``uprn_gdf`` (modified in place and returned).

        :param uprn_gdf: GeoDataFrame with a "UPRN" column and point geometry
        :return: the same GeoDataFrame with the new column
        """
        matches = gpd.sjoin(uprn_gdf, self.historic_england_listed_buildings, how="left", predicate="within")
        # A left join leaves index_right empty for points that are in no listed building polygon
        listed_uprns = matches.loc[matches.index_right.notna(), "UPRN"].unique()
        uprn_gdf['is_listed_building'] = uprn_gdf["UPRN"].isin(listed_uprns)
        return uprn_gdf

    def is_heritage_building_at_risk(self, coordinates: OpenUprnCoordinateData) -> bool:
        """
        Check if a location specified by British National Grid coordinates is a heritage building at risk.

        :param coordinates: object exposing X_COORDINATE and Y_COORDINATE (OpenUprnCoordinateData)
        :return: True if the location is within a heritage building at risk polygon, False otherwise
        """
        location = Point(coordinates.X_COORDINATE, coordinates.Y_COORDINATE)
        hits = self.historic_england_heritage_buildings.contains(location)
        if not hits.any():
            return False

        # Log which heritage-at-risk entries the location falls inside
        names = self.historic_england_heritage_buildings.loc[hits, "EntryName"]
        logger.info(f"The location is within the following heritage buildings at risk: {names.values}")
        return True

    def is_heritage_building_at_risk_vectorised(self, uprn_gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
        """
        Add a boolean ``is_heritage_building`` column to ``uprn_gdf`` (modified in place and returned).

        :param uprn_gdf: GeoDataFrame with a "UPRN" column and point geometry
        :return: the same GeoDataFrame with the new column
        """
        matches = gpd.sjoin(
            uprn_gdf, self.historic_england_heritage_buildings, how="left", predicate="within"
        )
        # A left join leaves index_right empty for points that are in no heritage-at-risk polygon
        heritage_uprns = matches.loc[matches.index_right.notna(), "UPRN"].unique()
        uprn_gdf['is_heritage_building'] = uprn_gdf["UPRN"].isin(heritage_uprns)
        return uprn_gdf
--- a/etl/spatial/init.py
+++ b/etl/spatial/init.py
--- a/etl/spatial/app.py
+++ b/etl/spatial/app.py
@ -0,0 +1,103 @@
 """
 This application reads in the open uprn data from a static location and loads it into
 our database for querying from other services
 """
 import os
 from tqdm import tqdm
 import pandas as pd
 from etl.spatial.ConservationAreaClient import ConservationAreaClient
 from etl.spatial.OpenUprnClient import OpenUprnClient
 from etl.spatial.SpecialBuildingsClient import SpecialBuildingsClient
 from utils.logger import setup_logger
 from utils.s3 import save_dataframe_to_s3_parquet
 # S3 bucket the raw spatial datasets are read from (passed as `bucket` to each client in app())
 BUCKET = "retrofit-datalake-dev"
 # S3 bucket the per-partition parquet results and the filename index are written to
 OUTPUT_BUCKET = "retrofit-data-dev"
 # Keys of the manually uploaded source datasets
 HISTORIC_ENGLAND_CONSERVARION_AREAS_PATHNAME = "spatial/Historic_England_Conservation_Areas/Conservation_Areas.shp"
 GOV_CONSERVARION_AREAS_PATHNAME = "spatial/gov-conservation-area.geojson"
 # NOTE(review): the folder is dated 202309 but the file is dated 202308 - confirm this key exists as written
 OPEN_UPRN_PATHNAME = "spatial/osopenuprn_202309_csv/osopenuprn_202308.csv"
 HISTORIC_ENGLAND_LISTED_BUILDINGS_PATHNAME = "spatial/National_Heritage_List_for_England_(" \
                                             "NHLE)/Listed_Building_polygons.shp"
 HISTORIC_ENGLAND_HERITAGE_BUILDINGS_PATHNAME = \
    "spatial/Historic_England_Heritage_at_Risk_Register_2022/Historic_England_Heritage_at_Risk_Register_2022.shp"
 # Module-level logger shared by the application
 logger = setup_logger()
 def app():
    """
    This application uses the conservation area, listed building and heritage-at-risk datasets to flag
    every UPRN in the Open UPRN data, then writes the flagged UPRNs to S3 as one parquet file per partition.
    We use two sources of data for determining if homes are in conservation areas.
    The first is the Historic England dataset, which is a shapefile containing
    polygons of conservation areas. The second is the gov.uk dataset, which is a
    geojson file containing polygons of conservation areas.
    The Historic England dataset can be found here:
    https://historicengland.org.uk/listing/the-list/data-downloads/#aa33785e
    The listed building dataset is also found at Historic England at:
    https://historicengland.org.uk/listing/the-list/data-downloads/#aa33785e
    The heritage buildings dataset is also found at Historic England at:
    https://historicengland.org.uk/listing/the-list/data-downloads/#aa33785e
    The Gov.uk dataset can be found here:
    https://www.planning.data.gov.uk/dataset/conservation-area
    The open UPRN data can be found here:
    https://osdatahub.os.uk/downloads/open/OpenUPRN
    The Office for National Statistics Postcode Lookup can be found here:
    https://geoportal.statistics.gov.uk/datasets/9ac0331178b0435e839f62f41cc61c16/about
    For the moment, these data sources are downloaded manually and uploaded to S3.
    This application then processes those files and writes the results to s3
    """
    conservation_area_client = ConservationAreaClient(
        historic_england_path=HISTORIC_ENGLAND_CONSERVARION_AREAS_PATHNAME,
        gov_path=GOV_CONSERVARION_AREAS_PATHNAME,
        bucket=BUCKET
    )
    conservation_area_client.read()
    special_buildings_client = SpecialBuildingsClient(
        historic_england_listed_buildings_path=HISTORIC_ENGLAND_LISTED_BUILDINGS_PATHNAME,
        historic_england_heritage_buildings_path=HISTORIC_ENGLAND_HERITAGE_BUILDINGS_PATHNAME,
        bucket=BUCKET
    )
    special_buildings_client.read()
    open_uprn_client = OpenUprnClient(
        path=OPEN_UPRN_PATHNAME,
        bucket=BUCKET
    )
    open_uprn_client.read()
    # We want to sort the data and split it into filenames on UPRN.
    # We'll split the data into chunks of 50,000
    open_uprn_client.create_file_partitions()
    logger.info("Extracting spatial data for uprn partitions")
    # Group once and reuse the grouping for both the progress-bar total and the iteration
    uprn_partitions = open_uprn_client.data.groupby("filename")
    for filename, uprn_df in tqdm(uprn_partitions, total=len(uprn_partitions)):
        uprn_gdf = OpenUprnClient.convert_bng_data_to_gpd(uprn_df)
        uprn_gdf = conservation_area_client.is_in_conservation_area_vectorised(uprn_gdf=uprn_gdf)
        uprn_gdf = special_buildings_client.is_listed_building_vectorised(uprn_gdf=uprn_gdf)
        uprn_gdf = special_buildings_client.is_heritage_building_at_risk_vectorised(uprn_gdf=uprn_gdf)
        # Convert back to a regular dataframe: the geometry column is dropped before writing parquet
        uprn_gdf = uprn_gdf.drop(columns=["geometry"])
        uprn_gdf = pd.DataFrame(uprn_gdf)
        save_dataframe_to_s3_parquet(
            df=uprn_gdf, file_key=os.path.join("spatial", filename), bucket_name=OUTPUT_BUCKET
        )
    # We finally save the filenames to s3
    open_uprn_client.save_filenames_to_s3(bucket_name=OUTPUT_BUCKET)
--- a/conservation_areas/requirements.txt
+++ b/conservation_areas/requirements.txt
--- a/etl/spatial/tests/test_borehole_client.py
+++ b/etl/spatial/tests/test_borehole_client.py
@ -1,5 +1,5 @@
 import pytest
-from model_data.BoreholeClient import BoreholeClient
+from etl.spatial.BoreholeClient import BoreholeClient
@pytest.fixture
--- a/etl/wall_area/init.py
+++ b/etl/wall_area/init.py
--- a/model_data/simulation_system/area_data.py
+++ b/model_data/simulation_system/area_data.py
@ -1,5 +1,5 @@
 """
-This script produces the dataset used to model the wall area of properties, which is used to estimate the cost
+This script produces the dataset used to model the wall area of epc, which is used to estimate the cost
 of insulation measures within homes
 """
 import os
--- a/model_data/simulation_system/requirements/area_data.txt
+++ b/model_data/simulation_system/requirements/area_data.txt
--- a/infrastructure/terraform/main.tf
+++ b/infrastructure/terraform/main.tf
@ -83,7 +83,7 @@ resource "aws_db_instance" "default" {
  publicly_accessible = true
 }
-# Set up the bucket that recieve the csv uploads of properties to be retrofit
+# Set up the bucket that receives the csv uploads of properties to be retrofit
 module "s3_presignable_bucket" {
  source          = "./modules/s3_presignable_bucket"
  bucketname      = "retrofit-plan-inputs-${var.stage}"
--- a/input_property_list.csv
+++ b/input_property_list.csv
@ -0,0 +1,12 @@
 address,postcode,Notes,,,,
 28 Distillery Wharf,W6 9bf,,,,,
 Flat 14 Godley V C House,E2 0LP,,,,,
 49 Elderfield Road,E5 0LF,,,,,
 26 Stanhope Road,N6 5NG,,,,,
 Flat 3 Frederick Building,N1 4BD,,,,,
 Flat 4 Frederick Building,N1 4BD,,,,,
 "Flat 28, 22 Adelina Grove",E1 3BX,,,,,
 "Flat 39, 239 Long Lane",SE1 4PT,,,,,
 "1, Westview, Somerby",LE14 2QH,This property has an unfilled cavity,,,,
 "59, Ashdale",CM23 4EB,This property has a partially filled cavity,,,,
 88 Cleveland Avenue,DL3 7BE,This property has a filled cavity,,,,
--- a/model_data/README.md
+++ b/model_data/README.md
@ -1,49 +0,0 @@
 # Environment setup
 We're using conda to manage environments to circumvent the
 issues with Mac M1. This documentation will also cover Pycharm setup.
 We're working in python 3.10 so
 ```commandline
 conda create -n hestia-data python=3.10
 ```
 Then activate the environment
 ```commandline
 conda activate hestia-data
 ```
 To set up with Pycharm, run
 ```commandline
 which python
 ```    
 and grab the path to the python executable. Then in Pycharm, go to
 Settings > Project > Python Interpreter and click the gear icon
 to add a new interpreter. Select Conda and either paste the path to the python executable
 and click OK, or select the conda environment from the dropdown.
 You may need to restart Pycharm for the new interpreter to be recognised.
 To install project dependencies navigate to /model_data and run
 ```commandline
 pip install -r requirements.txt
 ```
 ### Running Tests
 If you are not in a virtual environment, activate it with
 ```commandline
 conda activate envName
 ```
 Then run
 ```commandline
 pytest --cov-config=model_data/.coveragerc --cov=model_data
 ```
--- a/model_data/analysis/SapModel.py
+++ b/model_data/analysis/SapModel.py
@ -1,650 +0,0 @@
 import numpy as np
 import pandas as pd
 import statsmodels.api as sm
 import matplotlib.pyplot as plt
 from typing import Dict, Optional, List
 from sklearn.model_selection import train_test_split
 from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, explained_variance_score, \
    median_absolute_error, mean_absolute_percentage_error
 from sklearn.ensemble import RandomForestRegressor
 from sklearn.inspection import permutation_importance
 from model_data.EpcClean import EpcClean
 from statsmodels.stats.outliers_influence import variance_inflation_factor
 from tqdm import tqdm
 from utils.logger import setup_logger
 logger = setup_logger()
 class SapModel:
    # We want to estimate for making improvements on different property components
    RESPONSE = "current-energy-efficiency"
    # We could potentially  build models by constituency to avoid having too many
    # features in the model
    BASE_FEATURES = [
        "property-type",
        "built-form",
        "construction-age-band",
        "number-habitable-rooms",
        "constituency",
        "number-heated-rooms",
        "transaction-type"
    ]
    COMPONENT_FEATURES = [
        "walls-description",
        "floor-description",
        "lighting-description",
        "roof-description",
        "mainheat-description",
        "hotwater-description",
        "main-fuel",
        "mechanical-ventilation",
        "secondheat-description",
        "energy-tariff",
        "solar-water-heating-flag",
        "photo-supply",
        "windows-description",
        "glazed-type",
        "glazed-area",
        "multi-glaze-proportion",
        # "lighting-description"  # Might not need to use this
        "low-energy-lighting",
        "number-open-fireplaces",
        "mainheatcont-description",
        "fixed-lighting-outlets-count",
        "floor-height",
        "floor-level",
        "total-floor-area",
        "extension-count",
    ]
    CATEGORICAL_COLS = [
        "property-type",
        "built-form",
        "number-habitable-rooms",
        "constituency",
        "number-heated-rooms",
        "mainheat-description",
        "hotwater-description",
        "main-fuel",
        "mechanical-ventilation",
        "secondheat-description",
        "energy-tariff",
        "solar-water-heating-flag",
        "windows-description",
        "glazed-type",
        "glazed-area",
        "construction-age-band",
        "lighting-description",
        "mainheatcont-description",
        "floor-level",
    ]
    NUMERICAL_COLUMNS = [
        "photo-supply", "multi-glaze-proportion", "low-energy-lighting", "number-open-fireplaces",
        "fixed-lighting-outlets-count",
        "floor-height",
        "total-floor-area",
        "extension-count",
    ]
    # For the moment, we store records of the best performing models as a benchmark for future improvements
    BEST_FIT = {
        'MAPE': 0.04646530042225876, 'Mean Squared Error': 18.635209563729763,
        'Mean Absolute Error': 2.856347408023325, 'R2 Score': 0.800701753826118,
        'Explained Variance Score': 0.800701753826118, 'Median Absolute Error': 1.9026758012120197
    }
    BEST_PREDICT = {
        'MAPE': 0.04346083528432316, 'Mean Squared Error': 21.16036509335514,
        'Mean Absolute Error': 3.0440540802375833, 'R2 Score': 0.7219965012634312,
        'Explained Variance Score': 0.7220620137390414, 'Median Absolute Error': 1.9031967986967828
    }
    BEST_FINAL = {
        'MAPE': 0.04841470773386795, 'Mean Squared Error': 21.323052316630914, 'Mean Absolute Error': 2.988547998636157,
        'R2 Score': 0.7633662459299112, 'Explained Variance Score': 0.7633785339028832,
        'Median Absolute Error': 1.9487883489495985
    }
    BUCKET_VARIABLES = [
        "number-open-fireplaces", "fixed-lighting-outlets-count", 'extension-count', 'multi-glaze-proportion'
    ]
    def __init__(
        self, data: List[Dict],
        cleaner: EpcClean,
        test_size: Optional[float] = 0.2,
        random_state: Optional[int] = None
    ):
        self.df = pd.DataFrame(data)
        self.cleaner = cleaner
        self.random_state = random_state if random_state is not None else 42
        self.test_size = 0.2 if test_size is None else test_size
        self.model_data = None
        self.train_x = None
        self.train_y = None
        self.test_x = None
        self.test_y = None
        self.test_model = None
        self.final_model = None
        self.fit_error = None
        self.predict_error = None
        self.final_error = None
        self.worst = {
            "fit_errors": pd.DataFrame(),
            "prediction_errors": pd.DataFrame(),
            "fit_x": pd.DataFrame(),
            "prediction_x": pd.DataFrame(),
            "final_errors": pd.DataFrame(),
            "final_x": pd.DataFrame(),
        }
        self.fit_df = None
        self.predict_df = None
        self.final_fit_df = None
        self.diagnosis = {}
    def run(self, plot: bool = False) -> None:
        """
        A pipeline method to run all necessary methods in correct order.
        :param plot: Boolean to indicate whether to plot the regression
        """
        try:
            self.create_dataset()
            self.fit_model()
            if plot:
                self.plot_regression(self.fit_df)
        except Exception as e:
            logger.error("An error occurred during execution.")
            logger.error(str(e))
    def _merge_with_u_values(
        self, model_data: pd.DataFrame, description: str, thermal_transmittance: str
    ) -> pd.DataFrame:
        """
        Utility function to merge u value data with model data
        :param model_data: Pandas dataframe which is the main modelling dataset
        :param description: Name of the description column for which we're merging u-values onto
        :param thermal_transmittance: Name of the thermal transmittance column
        :return:
        """
        u_values = pd.DataFrame(self.cleaner.cleaned[f"{description}-description"])[
            ["original_description", thermal_transmittance]].rename(
            columns={thermal_transmittance: f"{description}_u_value"}
        )
        model_data = model_data.merge(
            u_values,
            how="left",
            left_on=f"{description}-description",
            right_on="original_description"
        ).drop(columns=["original_description"])
        return model_data
    def _append_cleaned_data(self, model_data: pd.DataFrame) -> pd.DataFrame:
        """
        Appends cleaned data into the model data.
        :param model_data: Original model data.
        :return: Model data with cleaned data appended.
        """
        for description in ["walls", "floor", "roof"]:
            model_data = self._merge_with_u_values(model_data, description, "thermal_transmittance")
        # lighting_proportions added separately as it doesn't use the _merge_with_u_values method
        lighting_proportions = pd.DataFrame(self.cleaner.cleaned["lighting-description"])[
            ["original_description", "low_energy_proportion"]]
        model_data = model_data.merge(
            lighting_proportions,
            how="left",
            left_on="lighting-description",
            right_on="original_description"
        ).drop(columns=["original_description"])
        return model_data
    @staticmethod
    def _convert_transaction_type(model_data: pd.DataFrame) -> pd.DataFrame:
        """
        Converts transaction type to boolean
        :param model_data: Model data with transaction type.
        :return: Model data with converted transaction type.
        """
        model_data["is_rdsap"] = model_data["transaction-type"] != "new dwelling"
        model_data = model_data.drop(columns=["transaction-type"])
        return model_data
    @staticmethod
    def bucket_and_fill(df: pd.DataFrame, column_name: str, n_bins: int = 10) -> pd.DataFrame:
        """
        Simple utility function to bucket up features into bins and then fill any missing values with "NO_RECORD"
        :param df: Dataframe of features to be binned
        :param column_name: Name of the column to be binned
        :param n_bins: Number of bins to use
        :return: Dataframe with binned column
        """
        # Check if the column is numerical
        if np.issubdtype(df[column_name].dtype, np.number):
            # Create a new categorical column from numerical one by binning the data
            df[column_name + "_bucket"] = pd.cut(df[column_name], bins=n_bins).astype(str)
            # Replace missing data with "NO_RECORD"
            df[column_name + "_bucket"] = df[column_name + "_bucket"].fillna("NO_RECORD")
            df[column_name + "_bucket"] = np.where(
                df[column_name + "_bucket"] == "nan",
                "NO_RECORD",
                df[column_name + "_bucket"]
            )
        return df
    def _clean_numericals(self, model_data):
        # Try binning numericals
        remaining_numericals = [x for x in self.NUMERICAL_COLUMNS if x not in self.BUCKET_VARIABLES]
        for col in self.BUCKET_VARIABLES:
            model_data[col] = pd.to_numeric(model_data[col], errors='coerce')
            # If all values are missing, set all values to 0 - this column will get dropped
            if all(pd.isnull(model_data[col])):
                model_data[col + "_bucket"] = "NO_RECORD"
                continue
            model_data = self.bucket_and_fill(model_data, col)
        # Replace the data with the binned version
        model_data = model_data.drop(columns=self.BUCKET_VARIABLES)
        model_data = model_data.rename(
            columns=dict(zip([c + "_bucket" for c in self.BUCKET_VARIABLES], self.BUCKET_VARIABLES))
        )
        # Basic fill the rest of the columns with 0 - currenrtly this provided the best performance
        for col in remaining_numericals:
            model_data[col] = np.where(
                model_data[col] == "", "0", model_data[col]
            ).astype(float)
        return model_data
    @staticmethod
    def clean_missings(model_data: pd.DataFrame) -> pd.DataFrame:
        """
        Fills categorical missing data with sensible values
        :param model_data: Original model data.
        :return: Model data with cleaned categorical data.
        """
        # Cleaning of energy-tariff and construction-age-band hurt prediction performance, indicating there is
        # potentially
        # a notable difference between a "" missing and a "NO DATA!" missing, worth differentiating
        model_data["mechanical-ventilation"] = np.where(
            model_data["mechanical-ventilation"] == "", "NO DATA!", model_data["mechanical-ventilation"]
        )
        model_data["solar-water-heating-flag"] = np.where(
            model_data["solar-water-heating-flag"] == "", "N", model_data["solar-water-heating-flag"]
        )
        model_data["glazed-type"] = np.where(
            model_data["glazed-type"] == "", "NO DATA!", model_data["glazed-type"]
        )
        model_data["glazed-area"] = np.where(
            model_data["glazed-area"] == "", "NO DATA!", model_data["glazed-type"]
        )
        return model_data
    def create_dataset(self):
        """
        Build the modelling dataset from the raw records and store it on self.model_data.
        The cleaned lookups are merged on, missing values and numericals are cleaned, and only rows
        with wall, floor and roof u-values are retained.
        """
        logger.info("Creating modelling dataset")
        source_columns = [self.RESPONSE] + self.COMPONENT_FEATURES + self.BASE_FEATURES
        model_data = self.df[source_columns].reset_index(drop=True)
        # idx lets fitted values be traced back to the source rows
        model_data["idx"] = model_data.index.copy()
        # Order matters: u-values are appended first, numericals are cleaned last
        cleaning_steps = (
            self._append_cleaned_data,
            self.clean_missings,
            self._convert_transaction_type,
            self._clean_numericals,
        )
        for step in cleaning_steps:
            model_data = step(model_data)
        # Take just entries with U-values
        # TODO: Rather than doing this, do we want to include the estimated u-values?
        #       Since this ends up with just 2k entries
        u_value_columns = ["walls_u_value", "floor_u_value", "roof_u_value"]
        has_all_u_values = model_data[u_value_columns].notnull().all(axis=1)
        model_data = model_data[has_all_u_values]
        # The raw descriptions are represented by u-values and transaction-type by is_rdsap
        excluded = ["walls-description", "floor-description", "roof-description", "transaction-type"]
        candidates = (
            self.BASE_FEATURES + self.COMPONENT_FEATURES + u_value_columns + [self.RESPONSE, "idx", "is_rdsap"]
        )
        model_data = model_data[[name for name in candidates if name not in excluded]]
        for col in self.CATEGORICAL_COLS:
            model_data[col] = model_data[col].astype('category')
        # The response must be numeric for the regression
        model_data[self.RESPONSE] = model_data[self.RESPONSE].astype(float)
        self.model_data = model_data
    def make_training_test(self, x):
        """
        Split the encoded dataset into training and test sets, stored on train_x/test_x/train_y/test_y.
        :param x: Encoded modelling dataframe that still contains the response column
        """
        covariates = x.drop(self.RESPONSE, axis=1)
        response = x[self.RESPONSE]
        split = train_test_split(
            covariates, response, test_size=self.test_size, random_state=self.random_state
        )
        self.train_x, self.test_x, self.train_y, self.test_y = split
    @staticmethod
    def remove_zero_std_cols(train_x, test_x=None, threshold=1e-3):
        """
        Utility function to remove columns that have zero standard deviation from both test and train sets
        :param train_x: Training data to remove columns from
        :param test_x: If provided, remove the same columns from the test data
        :param threshold: float value, if the standard deviation is below this threshold, the column is considered
                             to have zero standard deviation
        :return: Tuple of train_x and test_x (if provided). If test_x is not provided, a null placeholder is returned
        """
        # Compute standard deviations
        std_devs = train_x.std()
        # Find columns with zero or near-zero standard deviation
        zero_std_cols = std_devs[std_devs <= threshold].index
        # Drop these columns from the training data
        train_x = train_x.drop(zero_std_cols, axis=1)
        if test_x is not None:
            # Ensure the test data has the same columns
            test_x = test_x[train_x.columns]
            return train_x, test_x
        return train_x, None
    def fit_model(self):
        """
        Main function to fit the models and produce accuracy metrics.
        The modelling data is one-hot encoded and split into training and test sets. A testing model is
        fitted on the training split and scored on both splits against the stored benchmarks, after which a
        final model is fitted on every row using the columns that survived the training-set pre-processing.
        Populates test_model, final_model, fit_error, predict_error, final_error, fit_df, predict_df,
        final_fit_df, diagnosis and the error / covariate entries of worst.
        """
        x = pd.get_dummies(self.model_data, columns=self.CATEGORICAL_COLS + self.BUCKET_VARIABLES, drop_first=True)
        # Convert booleans to integer and any remaining object columns to float
        for col in x.columns:
            if x[col].dtype == bool:
                x[col] = x[col].astype(int)
            if x[col].dtype == object:
                x[col] = x[col].astype(float)
        # Create the training and test sets for each run
        self.make_training_test(x)
        self.train_x, self.test_x = self.remove_zero_std_cols(self.train_x, self.test_x)
        logger.info("Detecting multi-collinearity in training dataset")
        self.detect_multi_collinearity()
        # idx only traces rows back to the source data, so it is set aside and never used as a regressor
        train_idx = self.train_x["idx"].copy()
        test_idx = self.test_x["idx"].copy()
        # has_constant="add" always adds the intercept; the default "skip" would silently omit it whenever
        # a split happens to contain a constant non-zero column, leaving train and test designs mismatched
        train_x = sm.add_constant(self.train_x.drop(columns=["idx"]), has_constant="add")
        test_x = sm.add_constant(self.test_x.drop(columns=["idx"]), has_constant="add")
        logger.info("Fitting testing model")
        # make regression model
        model = sm.OLS(self.train_y, train_x)
        # fit model and print results
        self.test_model = model.fit()
        train_predictions = self.test_model.fittedvalues
        test_predictions = self.test_model.predict(test_x)
        self.fit_error, self.worst["fit_errors"] = self.calculate_regression_metrics(
            y_true=self.train_y, y_pred=train_predictions
        )
        # Predict on new data
        self.predict_error, self.worst["prediction_errors"] = self.calculate_regression_metrics(
            y_true=self.test_y, y_pred=test_predictions
        )
        fit_success = self.check_successes(self.fit_error, self.BEST_FIT)
        predict_success = self.check_successes(self.predict_error, self.BEST_PREDICT)
        # Aligned on index, so only the training rows receive a fitted value
        self.model_data['fit'] = self.test_model.fittedvalues
        # The worst errors over index heavily for flats
        self.worst["fit_x"] = self.model_data[self.model_data.index.isin(self.worst["fit_errors"].index)]
        self.worst["prediction_x"] = self.model_data[self.model_data.index.isin(self.worst["prediction_errors"].index)]
        self.fit_df = pd.DataFrame(
            {
                "fit": train_predictions,
                "actual": self.train_y,
                "idx": train_idx
            }
        ).sort_values("actual", ascending=True)
        self.predict_df = pd.DataFrame(
            {
                "predictions": test_predictions,
                "actual": self.test_y,
                "idx": test_idx
            }
        )
        self.diagnosis = {
            "fit_success": fit_success,
            "predict_success": predict_success,
            "summary": self.test_model.summary()
        }
        # We're now ready to fit the final model
        # For the moment, the pre-processing at the top of this function merely removes columns, so we
        # just need to remove the columns that were removed from the training data from the final model
        logger.info("Fitting final model")
        y = x[self.RESPONSE]
        final_x = x[self.train_x.columns]
        idx = final_x["idx"].copy()
        # self.train_x carries no intercept column, so the constant has to be added after the column
        # selection; adding it beforehand (as was done previously) dropped it and fitted without an intercept
        final_x = sm.add_constant(final_x.drop(columns=["idx"]), has_constant="add")
        final_model = sm.OLS(y, final_x)
        # fit model and print results
        self.final_model = final_model.fit()
        final_predictions = self.final_model.fittedvalues
        self.final_error, self.worst["final_errors"] = self.calculate_regression_metrics(
            y_true=y, y_pred=final_predictions
        )
        self.final_fit_df = pd.DataFrame(
            {
                "fit": final_predictions,
                "actual": y,
                "idx": idx
            }
        ).sort_values("actual", ascending=True)
    @staticmethod
    def check_successes(experiment_error, best_error):
        """
        Simple function to check if the experiment error is better than the best error
        :param experiment_error:    output of calculate_regression_metrics() on the experiment
        :param best_error:          Current benchmark best error
        :return:
        """
        successes = []
        for k in experiment_error:
            if k in ["Explained Variance Score", "R2 Score"]:
                # We want to maximise this so we want experiment error to be higher
                successes.append(
                    {
                        "measure": k,
                        "success": experiment_error[k] >= best_error[k],
                        "difference": abs(experiment_error[k] - best_error[k])
                    }
                )
                continue
            successes.append(
                {
                    "measure": k,
                    "success": experiment_error[k] <= best_error[k],
                    "difference": abs(experiment_error[k] - best_error[k])
                }
            )
        return pd.DataFrame(successes)
    def rf_importance(self, train_x, train_y, test_x, test_y):
        """
        Estimate feature importance with a random forest, to get a sense of the key features
        which are driving model performance.
        :param train_x: Training data covariates to build the importance model on
        :param train_y: Training data response to build the importance model on
        :param test_x:  Test data covariates to build the permutation importance model on
        :param test_y:  Test data response to build the permutation importance model on
        :return: Tuple of (random forest importances, permutation importances), each a dataframe
                 ranked from most to least important
        """
        forest = RandomForestRegressor(random_state=self.random_state)
        forest.fit(train_x, train_y)
        # Pair every training column with the importance the forest assigned to it
        importance_rows = [
            {"Feature": feature, "rf_importance": importance}
            for feature, importance in zip(train_x.columns, forest.feature_importances_)
        ]
        ranked = pd.DataFrame(importance_rows).sort_values(by="rf_importance", ascending=False)
        return ranked, self.permuation_importance(forest, test_x, test_y)
    @staticmethod
    def permuation_importance(rf, test_x, test_y):
        """
        Produce the permutation importance of each test covariate for a fitted model, scored
        with negative mean squared error.
        :param rf: Random forest model to calculate permutation importance for
        :param test_x: Test covariates to be used for permutation importance
        :param test_y: Test response to be used for permutation importance
        :return: Dataframe of features and their mean permutation importance, most important first
        """
        result = permutation_importance(rf, test_x, test_y, scoring='neg_mean_squared_error')
        table = pd.DataFrame({"Feature": test_x.columns, "perm_importance": result.importances_mean})
        return table.sort_values(by="perm_importance", ascending=False)
    def detect_multi_collinearity(self):
        """
        Drop every training column with an infinite variance inflation factor (VIF), other than the
        protected features, and align the test data to the surviving columns.
        """
        column_positions = tqdm(range(self.train_x.shape[1]))
        vifs = pd.DataFrame(
            {
                "features": self.train_x.columns,
                "vif": [variance_inflation_factor(self.train_x.values, i) for i in column_positions],
            }
        ).sort_values("vif", ascending=False)
        # These features are never removed, whatever their VIF
        protected = ["walls_u_value", "floor_u_value", "roof_u_value", "idx", "is_rdsap"]
        candidates = vifs[~vifs["features"].isin(protected)]
        # Acceptable drop variables:
        # main-fuel_Gas: mains gas
        # glazed-type_NO DATA!
        # glazed-area_NO DATA!
        to_drop = candidates.loc[np.isinf(candidates["vif"]), "features"].values
        self.train_x = self.train_x.drop(columns=to_drop)
        self.test_x = self.test_x[self.train_x.columns]
    @staticmethod
    def plot_regression(df):
        """
        Plot fitted against actual values as two lines over sequential row positions and show the figure.
        :param df: Dataframe with "fit" and "actual" columns
        """
        # Pull both series up front so a missing column fails before anything is drawn
        lines = (
            (df['fit'], 'red', 'Fit'),
            (df['actual'], 'blue', 'Actual'),
        )
        # The x-axis is simply the row position
        positions = np.arange(len(df))
        for values, colour, label in lines:
            plt.plot(positions, values, color=colour, label=label)
        plt.xlabel('Index')
        plt.ylabel('Value')
        plt.title('Linear Regression - Fit vs Actual')
        plt.legend()
        plt.show()
    @staticmethod
    def calculate_regression_metrics(y_true, y_pred, n=20):
        """
        Calculate six accuracy metrics for a regression and collect its largest errors.
        :param y_true: Array-like of true target values.
        :param y_pred: Array-like of predicted target values.
        :param n: Number of rows with the largest absolute residual to return.
        :return: Tuple of (dictionary of metrics, dataframe of the n worst errors with the columns
                 'Fit', 'Actual', 'Residual' and 'Absolute Residual')
        """
        metrics = {
            'MAPE': mean_absolute_percentage_error(y_true, y_pred),
            'Mean Squared Error': mean_squared_error(y_true, y_pred),
            'Mean Absolute Error': mean_absolute_error(y_true, y_pred),
            'R2 Score': r2_score(y_true, y_pred),
            'Explained Variance Score': explained_variance_score(y_true, y_pred),
            'Median Absolute Error': median_absolute_error(y_true, y_pred)
        }
        errors = pd.DataFrame()
        # y_true is assigned first so that its index anchors the frame, as it always has
        errors['Actual'] = y_true
        # 'Fit' holds the predictions; the two columns were previously swapped
        errors['Fit'] = y_pred
        errors['Residual'] = errors['Actual'] - errors['Fit']
        errors['Absolute Residual'] = np.abs(errors['Residual'])
        # Preserve the established column order
        errors = errors[['Fit', 'Actual', 'Residual', 'Absolute Residual']]
        worst_errors = errors.nlargest(n, 'Absolute Residual')
        return metrics, worst_errors
--- a/model_data/analysis/UvalueEstimations.py
+++ b/model_data/analysis/UvalueEstimations.py
@ -1,207 +0,0 @@
 import pickle
 import pandas as pd
 import numpy as np
 from model_data.EpcClean import EpcClean
 class UvalueEstimations:
    """
    Build median thermal transmittance (U-value) summaries from EPC records.

    Only records whose element description contains the text
    "Average thermal transmittance" are used. They are grouped by a set of
    property attributes plus a floor-area bucket, and the median transmittance
    and sample count of every group are stored on the instance.
    """

    # Records are kept only when their element description contains this text
    _MEASURED_MARKER = "Average thermal transmittance"

    def __init__(self, data: list):
        """
        Initialize the UvalueEstimations class.
        :param data: The input data as a list of dictionaries, to be converted to a dataframe
        """
        self.data = pd.DataFrame(data)
        self.walls = None
        self.walls_decile_data = {}
        self.roofs = None
        self.floors = None
        self.floors_decile_data = {}

    def get_estimates(self, cleaner: "EpcClean"):
        """
        Calculate U-value estimates for walls, roofs, and floors.
        :param cleaner: An instance of the EpcClean class used for cleaning data.
        """
        self.set_walls(cleaner)
        self.set_roofs(cleaner)
        self.set_floors(cleaner)

    def set_walls(self, cleaner: "EpcClean"):
        """
        Set U-value estimates for walls (stored in ``self.walls`` and ``self.walls_decile_data``).
        :param cleaner: An instance of the EpcClean class used for cleaning data.
        """
        self.walls, self.walls_decile_data = self._summarise_element(cleaner, "walls")

    def set_roofs(self, cleaner: "EpcClean"):
        """
        Set U-value estimates for roofs. Not implemented: ``self.roofs`` is left untouched.
        :param cleaner: An instance of the EpcClean class used for cleaning data.
        """
        pass

    def set_floors(self, cleaner: "EpcClean"):
        """
        Set U-value estimates for floors (stored in ``self.floors`` and ``self.floors_decile_data``).
        :param cleaner: An instance of the EpcClean class used for cleaning data.
        """
        self.floors, self.floors_decile_data = self._summarise_element(cleaner, "floor")

    def _summarise_element(self, cleaner: "EpcClean", element: str):
        """
        Summarise the median thermal transmittance of one building element.
        :param cleaner: Object whose ``cleaned["<element>-description"]`` holds records with the
            keys "original_description" and "thermal_transmittance".
        :param element: Column prefix of the element, e.g. "walls" or "floor".
        :return: A tuple of the summary dataframe and a dictionary with the keys
            "decile_labels" and "decile_boundaries".
        """
        description_col = f"{element}-description"
        energy_col = f"{element}-energy-eff"
        env_col = f"{element}-env-eff"
        area_col = "total-floor-area"
        wanted_columns = [
            "local-authority", "property-type", description_col, energy_col, env_col, "built-form",
            area_col, "number-habitable-rooms", "number-heated-rooms"
        ]
        group_keys = [
            "local-authority",
            "property-type",
            energy_col,
            env_col,
            "built-form",
            "number-habitable-rooms",
            "number-heated-rooms",
            area_col + "_group"
        ]
        # na=False treats a missing description as "no match"; without it a single missing
        # value makes the boolean mask unusable. The marker is plain text, so no regex.
        has_measured_value = self.data[description_col].str.contains(
            self._MEASURED_MARKER, regex=False, na=False
        )
        # Work on an explicit copy so the column assignments below never touch self.data
        element_df = self.data.loc[has_measured_value, wanted_columns].copy()
        element_df[area_col] = element_df[area_col].astype(float)
        element_df, decile_labels, decile_boundaries = self.classify_into_deciles(element_df, area_col)
        # We now get the U-values
        transmittance_lookup = pd.DataFrame(cleaner.cleaned[description_col])[
            ["original_description", "thermal_transmittance"]
        ]
        element_df = element_df.merge(
            transmittance_lookup,
            how="left",
            right_on="original_description",
            left_on=description_col
        )
        summary = element_df.groupby(group_keys, observed=True).agg(
            {"thermal_transmittance": ["median", "size"]}
        ).reset_index()
        # Flatten the two-level header produced by the aggregation
        summary.columns = group_keys + ["median_thermal_transmittance", "n_samples"]
        decile_data = {
            "decile_labels": decile_labels,
            "decile_boundaries": decile_boundaries
        }
        return summary, decile_data

    @staticmethod
    def classify_into_deciles(df: pd.DataFrame, column: str) -> "tuple[pd.DataFrame, list, np.ndarray]":
        """
        Break a column in a Pandas DataFrame into deciles and label every row with its group.

        The group column ``<column>_group`` is added to ``df`` in place. When several
        percentiles coincide (many identical values) the repeated boundaries are merged,
        so fewer than ten groups are produced; labels and boundaries always stay consistent.
        :param df: The input Pandas DataFrame.
        :param column: The column name to break into deciles.
        :return: A tuple containing:
            - The DataFrame with the decile group column.
            - The list of decile labels.
            - The array of decile boundaries (one more entry than there are labels).
        :raises ValueError: If the column has fewer than two distinct boundaries.
        """
        # Calculate decile boundaries
        percentile_edges = np.percentile(df[column], np.arange(0, 101, 10))
        # pd.cut rejects repeated bin edges, so collapse them (a no-op when all are distinct)
        decile_boundaries = np.unique(percentile_edges)
        if len(decile_boundaries) < 2:
            raise ValueError(f"Column '{column}' needs at least two distinct values to form groups")
        # Create decile labels, one per interval between consecutive boundaries
        decile_labels = [f"Decile {i + 1}" for i in range(len(decile_boundaries) - 1)]
        # Assign decile labels to existing values
        df[column + "_group"] = pd.cut(df[column], bins=decile_boundaries, labels=decile_labels,
                                       include_lowest=True)
        return df, decile_labels, decile_boundaries

    @staticmethod
    def classify_decile_newvalues(decile_boundaries, decile_labels, new_values: list) -> list:
        """
        Classify new values into existing deciles based on decile definitions.
        :param decile_boundaries: The decile boundaries returned by classify_into_deciles.
        :param decile_labels: The decile labels returned by classify_into_deciles.
        :param new_values: A list of new values to classify.
        :return: The classifications for the new values as a list; a value outside the
            boundaries is returned as NaN.
        """
        # Classify new values based on decile definitions
        classifications = pd.cut(new_values, bins=decile_boundaries, labels=decile_labels, include_lowest=True)
        return classifications.tolist()

    def _save(self, filename):
        """
        Useful utility function to store this object, which is particularly handy for unit testing
        :param filename: Path of the file the pickled object is written to.
        :return:
        """
        with open(filename, 'wb') as f:
            pickle.dump(self, f)
--- a/model_data/config.py
+++ b/model_data/config.py
@ -1,6 +0,0 @@
 import os
 from dotenv import load_dotenv
 # Read the variables defined in model_data/.env (path is relative to the working directory)
 load_dotenv(dotenv_path='model_data/.env')
 # Presumably the credential for the EPC API (inferred from the name); None when unset
 EPC_AUTH_TOKEN = os.getenv('EPC_AUTH_TOKEN')
--- a/model_data/downloader.py
+++ b/model_data/downloader.py
@ -1,29 +0,0 @@
 import time
 def pagenated_epc_download(client, params, page_size, n_pages, verbose=0, slowdown=0.1):
    """
    Download search results page by page and return all rows as one list.

    Paging stops as soon as a request returns a falsy response or once
    ``n_pages`` pages have been collected, whichever happens first.
    :param client: Object exposing ``domestic.search(params=, offset_from=, size=)``.
    :param params: Search parameters passed through to every request.
    :param page_size: Number of rows requested per page; also the offset increment.
    :param n_pages: Number of pages after which the download stops.
    :param verbose: When truthy, print the page number before each request.
    :param slowdown: Seconds to sleep before every request.
    :return: The concatenated "rows" of every response received.
    """
    collected_rows = []
    pages_fetched = 0
    offset = 0
    while True:
        if verbose:
            print("Pulling for page %s" % str(int(offset / page_size) + 1))
        time.sleep(slowdown)
        response = client.domestic.search(params=params, offset_from=offset, size=page_size)
        # Note: We can only make 10k queries for a single set of search queries.
        # It might make sense to download data via zip for machine learning since we don't need this
        # data to be perfectly up to date
        if not response:
            return collected_rows
        pages_fetched += 1
        collected_rows.extend(response["rows"])
        if pages_fetched == n_pages:
            return collected_rows
        offset += page_size
--- a/model_data/plotting/plotting_functions.py
+++ b/model_data/plotting/plotting_functions.py
@ -1,40 +0,0 @@
 import pandas as pd
 import seaborn as sns
 import matplotlib.pyplot as plt
 def create_heatmap_plots(data, response_var, pivot_var1, pivot_var2, order1=None, order2=None):
    """
    Show a heatmap of a response variable pivoted over two other variables.

    Each cell is the pivot-table aggregate (pandas' default, the mean) of the
    response variable for one combination of the two pivot variables.
    :param data: List of dictionaries, input data.
    :param response_var: String, response variable to be plotted (cast to float).
    :param pivot_var1: String, variable used for the heatmap rows.
    :param pivot_var2: String, variable used for the heatmap columns.
    :param order1: List, the order of categories for pivot_var1. Optional.
    :param order2: List, the order of categories for pivot_var2. Optional.
    Returns:
        None. Displays the generated plot.
    """
    # Build the frame and make sure the response column is numeric
    frame = pd.DataFrame(data)
    frame[response_var] = frame[response_var].astype(float)
    # Rows come from pivot_var1, columns from pivot_var2
    table = pd.pivot_table(frame, values=response_var, index=pivot_var1, columns=pivot_var2)
    # Apply the requested row order, then the requested column order
    if order1 is not None:
        table = table.reindex(order1)
    if order2 is not None:
        table = table[order2]
    # Draw the annotated heatmap
    heading = f"Heatmap of {response_var} by {pivot_var1} and {pivot_var2}"
    plt.figure(figsize=(10, 6))
    sns.heatmap(table, annot=True, fmt=".2f", cmap='coolwarm')
    plt.title(heading)
    plt.show()
--- a/Show more
+++ b/Show more
`@ -1,4 +1,4 @@`
	`from model_data.utils import is_percentage_or_number, correct_spelling`	`from etl.epc_clean.utils import is_percentage_or_number, correct_spelling`


	`class TestUtils:`	`class TestUtils:`