Merge pull request #237 from Hestia-Homes/spatial-data

Spatial data
2026-07-27 23:35:01 +00:00 · 2023-10-11 12:32:37 +08:00 · 2023-10-11 12:32:37 +08:00 · b2142a7f8e
commit b2142a7f8e
parent 642a224a7b 5d0d0825b0
154 changed files with 1977 additions and 13742 deletions
--- a/.coveragerc
+++ b/.coveragerc
@ -2,12 +2,8 @@
 omit =
    *__init__*
    */tests/*
-    model_data/temp_inputs.py
-    model_data/config.py
-    model_data/__init__.py
-    model_data/app.py
-    model_data/plotting/*
    recommendations/rdsap_tables.py
-    model_data/simulation_system/*
-    model_data/cleaner_app.py
+    */config.py
+    */app.py
+    */settings.py
    backend/app/*
--- a/.github/workflows/deploy_sap_model_lambda.yml
+++ b/.github/workflows/deploy_sap_model_lambda.yml
@ -1,81 +0,0 @@
-name: Sap Model Deploy
-
-on:
-  push:
-    branches: [ dev, prod ]
-
-jobs:
-  deploy:
-    runs-on: ubuntu-latest
-
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v3
-
-      - name: Set up Python
-        uses: actions/setup-python@v2
-        with:
-          python-version: 3.10.12
-
-      - name: Install Serverless and plugins
-        run: |
-          npm install -g serverless
-          npm install -g serverless-domain-manager
-
-      - name: AWS credentials for dev
-        if: github.ref == 'refs/heads/dev'
-        uses: aws-actions/configure-aws-credentials@v1
-        with:
-          aws-access-key-id: ${{ secrets.DEV_AWS_ACCESS_KEY_ID }}
-          aws-secret-access-key: ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }}
-          aws-region: eu-west-2
-
-      - name: AWS credentials for prod
-        if: github.ref == 'refs/heads/prod'
-        uses: aws-actions/configure-aws-credentials@v1
-        with:
-          aws-access-key-id: ${{ secrets.PROD_AWS_ACCESS_KEY_ID }}
-          aws-secret-access-key: ${{ secrets.PROD_AWS_SECRET_ACCESS_KEY }}
-          aws-region: eu-west-2
-
-      - name: Set domain name
-        id: set_domain
-        run: echo "::set-output name=domain::${{ secrets[format('{0}_DOMAIN_NAME', github.ref_name)] }}"
-
-      - name: Set ECR credentials
-        id: set_ecr_credentials
-        run: |
-          echo "::set-output name=ecr_uri::${{ secrets[format('{0}_SAP_MODEL_ECR_URI', github.ref_name)] }}"
-
-      - name: Setup Docker
-        uses: docker/setup-buildx-action@v1
-
-      - name: Login to ECR
-        run: |
-          aws ecr get-login-password --region eu-west-2 | docker login --username AWS --password-stdin ${{ steps.set_ecr_credentials.outputs.ecr_uri }}
-
-      # Building and pushing Docker image with caching
-      - name: Build and push Docker image
-        uses: docker/build-push-action@v3
-        with:
-          context: ./model_data/simulation_system
-          file: ./model_data/simulation_system/Dockerfiles/Dockerfile.prediction.lambda
-          push: true
-          tags: ${{ steps.set_ecr_credentials.outputs.ecr_uri }}:${{ github.sha }}
-          cache-from: type=gha
-          cache-to: type=gha,mode=max
-          platform: linux/amd64
-          provenance: false
-
-      - name: Deploy to AWS Lambda via Serverless
-        env:
-          RUNTIME_ENVIRONMENT: ${{ github.ref_name }}
-          MODEL_DIRECTORY_BUCKET: 'retrofit-model-directory-${{ github.ref_name }}'
-          PREDICTIONS_BUCKET: 'retrofit-sap-predictions-${{ github.ref_name }}'
-          DATA_BUCKET: 'retrofit-data-${{ github.ref_name }}'
-          DOMAIN_NAME: ${{ steps.set_domain.outputs.domain }}
-          ECR_URI: ${{ steps.set_ecr_credentials.outputs.ecr_uri }}
-          GITHUB_SHA: ${{ github.sha }}
-        run: |
-          # Deploy to AWS Lambda via Serverless
-          sls deploy --config sapmodel.serverless.yml --stage ${{ github.ref_name }} --verbose
--- a/.gitignore
+++ b/.gitignore
@ -239,7 +239,8 @@ fabric.properties
 .idea/caches/build_file_checksums.ser

 # Locally stored data
-/model_data/local_data/*
+local_data/*
+/local_data/*

 *.DS_Store
 infrastructure/terraform/.terraform*
@ -261,3 +262,6 @@ model_data/simulation_system/predictions/

 .idea/Model.iml
 .idea/misc.iml
+
+adhoc
+adhoc/*
--- a/.idea/Model.iml
+++ b/.idea/Model.iml
@ -7,7 +7,14 @@
      <sourceFolder url="file://$MODULE_DIR$/open_uprn" isTestSource="false" />
      <sourceFolder url="file://$MODULE_DIR$/recommendations" isTestSource="false" />
    </content>
-    <orderEntry type="jdk" jdkName="Python 3.10 (model_data)" jdkType="Python SDK" />
+    <orderEntry type="jdk" jdkName="Python 3.10 (backend)" jdkType="Python SDK" />
    <orderEntry type="sourceFolder" forTests="false" />
  </component>
+  <component name="PyNamespacePackagesService">
+    <option name="namespacePackageFolders">
+      <list>
+        <option value="$MODULE_DIR$/local_data" />
+      </list>
+    </option>
+  </component>
 </module>
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
@ -1,6 +1,6 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <project version="4">
-  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (model_data)" project-jdk-type="Python SDK" />
+  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (backend)" project-jdk-type="Python SDK" />
  <component name="PythonCompatibilityInspectionAdvertiser">
    <option name="version" value="3" />
  </component>
--- a/model_data/BaseUtility.py
+++ b/model_data/BaseUtility.py
@ -43,7 +43,9 @@ class Definitions:
        # contained within the first of these multiple entries is being provided. As there are no restrictions on the 
        # value in this first field it means that sometimes the first field in a multiple entry description field may 
        # contain a ‘null’ value. A resolution to correct these anomalies will be considered for future data releases.
-        "NULL"
+        "NULL",
+        # We sometimes see fields populated with just an empty string.
+        ""
    }

    DATA_ANOMALY_SUBSTRINGS = {
--- a/backend/Property.py
+++ b/backend/Property.py
@ -1,9 +1,22 @@
 from datetime import datetime
 import re
+import os
+import pandas as pd
+
+from etl.epc.DataProcessor import DataProcessor
+from etl.epc_clean.epc_attributes.all_cleaners import all_cleaner_map
+from utils.logger import setup_logger
+from utils.s3 import read_dataframe_from_s3_parquet
 from epc_api.client import EpcClient
-from model_data.config import EPC_AUTH_TOKEN
-from model_data.BaseUtility import Definitions
+from BaseUtility import Definitions
 from recommendations.rdsap_tables import england_wales_age_band_lookup
+from recommendations.recommendation_utils import estimate_floors, estimate_perimeter, get_wall_type, estimate_wall_area
+
+ENVIRONMENT = os.environ.get('ENVIRONMENT', 'dev')
+EPC_AUTH_TOKEN = os.environ.get('EPC_AUTH_TOKEN')
+DATA_BUCKET = os.environ.get('DATA_BUCKET', 'retrofit-data-dev' if ENVIRONMENT == 'dev' else None)
+
+logger = setup_logger()


 class Property(Definitions):
@ -30,17 +43,27 @@ class Property(Definitions):
    lighting = None

    coordinates = None
-    age_band = None

    def __init__(self, id, postcode, address1, epc_client=None, data=None):
        self.id = id
        self.postcode = postcode
        self.address1 = address1
        self.data = data
+        self.old_data = None
+        self.property_dimensions = None
+
+        self.uprn = None
        self.full_sap_epc = None
-        self.in_conservation_area = None
+        self.in_conservation_area, self.is_listed, self.is_heritage = None, None, None
+        self.restricted_measures = False
        self.year_built = None
        self.number_of_rooms = None
+        self.age_band = None
+        self.construction_age_band = None
+        self.number_of_floors = None
+        self.perimeter = None
+        self.wall_type = None
+        self.floor_type = None

        self.energy = None
        self.ventilation = None
@ -83,9 +106,14 @@ class Property(Definitions):
            ]
            if len(newest_response) > 1:
                raise Exception("More than one result found for this address - investigate me")
+
+            # We keep the older EPCs in case they contain information that is not present on the newest one
+            self.old_data = [epc for epc in response["rows"] if epc["lmk-key"] != newest_response[0]["lmk-key"]]
+
            response["rows"] = newest_response

        self.data = response["rows"][0]
+        self.uprn = int(self.data["uprn"])

    def set_coordinates(self, coordinates):
        """
@ -127,7 +155,7 @@ class Property(Definitions):
        """

        ventilation = self.data["mechanical-ventilation"]
-        # perform some simple cleaning - when checking 300k properties, the only unique values were
+        # perform some simple cleaning - when checking 300k EPCs, the only unique values were
        # {'', 'mechanical, supply and extract', 'NO DATA!', 'natural', 'mechanical, extract only'}
        if ventilation in self.DATA_ANOMALY_MATCHES or ventilation in [""]:
            ventilation = None
@ -145,7 +173,7 @@ class Property(Definitions):
        - solar_pv
        This is based on the "photo-supply" field in the EPC data.

-        When checking 100k properties, either the value was "" or a stringified number
+        When checking 100k EPCs, either the value was "" or a stringified number
        """

        solar_pv = self.data["photo-supply"]
@ -244,11 +272,10 @@ class Property(Definitions):
        self.set_count_variables()
        self.set_heat_loss_corridor()
        self.set_mains_gas()
-        self.set_floor_height()
-        self.set_wall_area()
-        self.set_floor_area()
        self.set_age_band()

+        self.set_basic_property_dimensions()
+
        for description, attribute in cleaned.items():

            if self.data[description] in self.DATA_ANOMALY_MATCHES:
@ -262,10 +289,19 @@ class Property(Definitions):
            attributes = [
                x for x in cleaned[description] if x["original_description"] == self.data[description]
            ]
-            if len(attributes) != 1:
+            if len(attributes) > 1:
                raise ValueError("Either No attributes or multiple found for %s" % description)
+
+            if len(attributes) == 0:
+                # We attempt to perform the clean on the fly
+                cleaner_cls = all_cleaner_map[description]
+                attributes = [cleaner_cls(self.data[description]).process()]
+
            setattr(self, self.ATTRIBUTE_MAP[description], attributes[0])

+        self.set_wall_type()
+        self.set_floor_type()
+
    def set_age_band(self):
        """
        Sets a cleaned version of the age band of the property given the EPC data
@ -275,14 +311,20 @@ class Property(Definitions):
        if not self.data:
            raise ValueError("Property does not contain data")

-        self.age_band = england_wales_age_band_lookup[self.data["construction-age-band"]]
+        self.construction_age_band = DataProcessor.clean_construction_age_band(self.data["construction-age-band"])
+        self.age_band = england_wales_age_band_lookup.get(self.construction_age_band)

-    def set_is_in_conservation_area(self, in_conservation_area):
+    def set_spatial(self, spatial: pd.DataFrame):
        """
        Sets whether the property is in a conservation area given the output of the ConservationAreaClient
-        :param in_conservation_area:  string value, indicating whether the property is in a conservation area
+        :param spatial:  Dataframe, containing the spatial data for the property
        """
-        self.in_conservation_area = in_conservation_area
+        self.in_conservation_area = spatial["conservation_status"].values[0]
+        self.is_listed = spatial["is_listed_building"].values[0]
+        self.is_heritage = spatial["is_heritage_building"].values[0]
+
+        if self.in_conservation_area is True | self.is_listed is True | self.is_heritage is True:
+            self.restricted_measures = True

    def set_year_built(self):
        """
@ -349,17 +391,6 @@ class Property(Definitions):
        else:
            self.mains_gas = map[self.data["mains-gas-flag"]]

-    def set_floor_height(self):
-        """
-        Sets the floor height of the property
-        :return:
-        """
-
-        if self.data["floor-height"] == "" or self.data["floor-height"] in self.DATA_ANOMALY_MATCHES:
-            self.floor_height = None
-        else:
-            self.floor_height = float(self.data["floor-height"])
-
    def _clean_upload_data(self, to_update):
        for k, v in to_update.items():
            if v in self.DATA_ANOMALY_MATCHES:
@ -443,21 +474,210 @@ class Property(Definitions):

        return property_details_epc

-    def set_wall_area(self):
-        """
-        This method is placeholder
-        It implements our floor area model to produce an estimate of the property's insulatable wall area
-        """
-
-        import random
-        self.insulation_wall_area = random.uniform(60, 100)
-
-    def set_floor_area(self):
-        """
-        Sets the floor area based on the EPC data
+    def get_spatial_data(self, uprn_filenames):

        """
-        # We don't know the number of floors at the moment so we're going to assume 1
-        # however this is something we'll need to use Verisk data for
+        Given a property's UPRN, this method will pull the associated spatial data from s3
+        :return:
+        """
+
+        if self.uprn is None:
+            raise ValueError("URPN is not set, run search_address_epc")
+
+        # We get the file name for the uprn
+        filtered_df = uprn_filenames[(uprn_filenames['lower'] <= self.uprn) & (uprn_filenames['upper'] >= self.uprn)]
+        if filtered_df.empty:
+            logger.warning("Could not find file containing UPRNS")
+            return None
+
+        filename = filtered_df.iloc[0]['filenames']
+
+        spatial_data = read_dataframe_from_s3_parquet(
+            bucket_name=DATA_BUCKET, file_key=f"spatial/{filename}"
+        )
+
+        spatial = spatial_data[spatial_data["UPRN"] == self.uprn]
+
+        # Pull out spatial features
+        self.set_spatial(spatial)
+
+    def _filter_property_dimensions(self, property_dimensions):
+        """
+        Will filter the property dimensions dataframe to only include the relevant rows for the property
+        :param property_dimensions:
+        :return: filtered property dimensions dataframe
+        """
+
+        result = property_dimensions[(property_dimensions["PROPERTY_TYPE"] == self.data["property-type"])]
+
+        if self.construction_age_band is not None and self.construction_age_band not in self.DATA_ANOMALY_MATCHES:
+            result = result[(result["CONSTRUCTION_AGE_BAND"] == self.construction_age_band)]
+
+        if self.data["built-form"] not in self.DATA_ANOMALY_MATCHES and self.data["built-form"] in result["BUILT_FORM"]:
+            result = result[(result["BUILT_FORM"] == self.data["built-form"])]
+
+        return result[["NUMBER_HABITABLE_ROOMS", "TOTAL_FLOOR_AREA", "FLOOR_HEIGHT"]].mean()
+
+    def set_basic_property_dimensions(self):
+        """
+        This method sets the number of floors of the property, using a simple approach based on an estimate for
+        average room size, number of rooms and total floor area
+
+        It sets the perimeter of the property, using a simple approach based on an estimate for average room size,
+        number of rooms and total floor area
+
+        Also sets floor area, number of rooms, using backup cleaned values if this data is not present, based on
+        medians across the EPC data
+        :return:
+        """

        self.floor_area = float(self.data["total-floor-area"])
+
+        if not self.data["number-habitable-rooms"] or (
+            self.data["floor-height"] == "" or self.data["floor-height"] in self.DATA_ANOMALY_MATCHES
+        ):
+            if self.property_dimensions is None:
+                property_dimensions = read_dataframe_from_s3_parquet(
+                    bucket_name=DATA_BUCKET, file_key=f"property_dimensions/{self.data['local-authority']}.parquet"
+                )
+                self.property_dimensions = self._filter_property_dimensions(property_dimensions)
+
+        if not self.data["number-habitable-rooms"]:
+            self.number_of_rooms = float(self.property_dimensions["NUMBER_HABITABLE_ROOMS"].round())
+        else:
+            self.number_of_rooms = float(self.data["number-habitable-rooms"])
+
+        if self.data["property-type"] == "House":
+            self.number_of_floors = estimate_floors(self.floor_area, self.number_of_rooms)
+        elif self.data["property-type"] == "Flat":
+            self.number_of_floors = 1
+        else:
+            raise NotImplementedError("Implement me")
+
+        if self.data["floor-height"] == "" or self.data["floor-height"] in self.DATA_ANOMALY_MATCHES:
+            self.floor_height = float(self.property_dimensions["FLOOR_HEIGHT"].round(2))
+        else:
+            self.floor_height = float(self.data["floor-height"])
+
+        self.perimeter = estimate_perimeter(
+            self.floor_area / self.number_of_floors, self.number_of_rooms / self.number_of_floors
+        )
+
+        self.insulation_wall_area = estimate_wall_area(
+            num_floors=self.number_of_floors, floor_height=self.floor_height, perimeter=self.perimeter
+        )
+
+    def set_wall_type(self):
+        """
+        This method sets the wall type of the property, using a simple approach based on the wall description
+        :return:
+        """
+        self.wall_type = get_wall_type(**self.walls)
+
+    def set_floor_type(self):
+        """
+        This method sets the floor type of the property, which is used for calculating u-values
+        :return:
+        """
+        self.floor_type = "suspended" if self.floor["is_suspended"] else "solid"
+
+    @staticmethod
+    def _extract_component(component_data, component_rename_cols, component_drop_cols, rename_prefix=None):
+        for k in component_rename_cols:
+            component_data[f"{rename_prefix}_{k}"] = component_data[k]
+
+        component_data = {
+            k: v for k, v in component_data.items() if k not in component_drop_cols + component_rename_cols
+        }
+
+        return component_data
+
+    def get_model_data(self):
+        """
+        This method extracts cleaned data from the property object, which is used in our machine learning models
+
+        This will use many of the cleaned properties, extracted from the epc data, or methods in DataProcessor.
+
+        For future iterations of this, we probably want to implement a singular method in DataProcessor, which can
+        be used in the etl code and in here
+
+        Reads the cleaned component attributes (walls, roof, floor, windows, main_fuel, main_heating,
+        main_heating_controls, hotwater), the raw EPC record in self.data and the dimensions already set on the
+        property (perimeter, construction_age_band, floor_height, number_of_rooms, floor_area).
+
+        :return: dictionary of model data to be scored in the model
+        :raises NotImplementedError: if built form is an anomaly value and the property type has no fallback
+                                     in the built form cleaning map
+        """
+
+        # Keys removed from every cleaned component before it is merged into the model data
+        drop_cols = ["original_description", "clean_description"]
+        insulation_drop_cols = ["thermal_transmittance_unit", "is_assumed", "is_valid"]
+        # Keys re-keyed with the component name as a prefix, e.g. "walls_thermal_transmittance"
+        insulation_rename_cols = ["thermal_transmittance", "insulation_thickness"]
+
+        walls = self._extract_component(self.walls, insulation_rename_cols, insulation_drop_cols + drop_cols, "walls")
+        roof = self._extract_component(self.roof, insulation_rename_cols, insulation_drop_cols + drop_cols, "roof")
+        floor = self._extract_component(self.floor, insulation_rename_cols, insulation_drop_cols + drop_cols, "floor")
+
+        # tariff_type is re-keyed per component ("main-fuel_tariff_type", "hotwater_tariff_type")
+        windows = self._extract_component(self.windows, [], drop_cols + ["no_data"])
+        fuel = self._extract_component(self.main_fuel, ["tariff_type"], drop_cols + ["tariff_type"], "main-fuel")
+        main_heating = self._extract_component(self.main_heating, [], drop_cols + ["has_assumed"])
+        main_heating_controls = self._extract_component(self.main_heating_controls, [], drop_cols)
+        hotwater = self._extract_component(self.hotwater, ["tariff_type"], drop_cols + ['assumed'], "hotwater")
+
+        # Passed through as the raw EPC description for now - we'll need to clean second heating
+        second_heating = self.data["secondheat-description"]
+
+        # Fields copied straight from the EPC record; the EPC record is keyed by the lower-cased, hyphenated
+        # form of each name (e.g. "TRANSACTION_TYPE" is read from "transaction-type")
+        epc_raw_columns = [
+            'TRANSACTION_TYPE',
+            'ENERGY_TARIFF',
+            'PROPERTY_TYPE',
+            'UPRN',
+            'NUMBER_OPEN_FIREPLACES',
+            'FIXED_LIGHTING_OUTLETS_COUNT',
+            'MULTI_GLAZE_PROPORTION',
+            'MECHANICAL_VENTILATION',
+            'PHOTO_SUPPLY',
+            'LOW_ENERGY_LIGHTING',
+            'SOLAR_WATER_HEATING_FLAG',
+            'GLAZED_TYPE',
+            'CONSTITUENCY',
+            'NUMBER_HEATED_ROOMS',
+            'EXTENSION_COUNT',
+        ]
+        epc_raw_data = {
+            k: self.data[k.lower().replace("_", "-")] for k in epc_raw_columns
+        }
+
+        # Fallback built form per property type, used only when the EPC built form is an anomaly value
+        built_form_cleaning_map = {
+            "Flat": "Mid-Terrace",
+            "House": "Semi-Detached",
+            "Bungalow": "Detached",
+            "Maisonette": "Mid-Terrace"
+        }
+
+        built_form = self.data["built-form"]
+        if built_form in self.DATA_ANOMALY_MATCHES:
+            # TODO: If built form isn't captured, we use the most common value for that property type - we shall
+            #       improve this methodology
+            built_form = built_form_cleaning_map.get(self.data["property-type"])
+            if not built_form:
+                raise NotImplementedError("Not handled this property type when cleaning built form")
+
+        # Later entries win on duplicate keys, so the order in which the dictionaries are unpacked matters
+        property_data = {
+            **walls,
+            **roof,
+            **floor,
+            **fuel,
+            **main_heating,
+            **main_heating_controls,
+            **hotwater,
+            **windows,
+            "SECONDHEAT_DESCRIPTION": second_heating,
+            "DAYS_TO": DataProcessor.calculate_days_to(self.data["lodgement-date"]),
+            "SAP": float(self.data["current-energy-efficiency"]),
+            "CARBON": float(self.data["co2-emissions-current"]),
+            "HEAT_DEMAND": float(self.data["energy-consumption-current"]),
+            "estimated_perimeter": self.perimeter,
+            "CONSTRUCTION_AGE_BAND": self.construction_age_band,
+            "FLOOR_HEIGHT": self.floor_height,
+            "NUMBER_HABITABLE_ROOMS": self.number_of_rooms,
+            "TOTAL_FLOOR_AREA": self.floor_area,
+            **epc_raw_data,
+            "BUILT_FORM": built_form,
+        }
+
+        return property_data
--- a/backend/app/db/functions/materials_functions.py
+++ b/backend/app/db/functions/materials_functions.py
@ -1,10 +1,17 @@
 from backend.app.db.models.materials import Material
+from functools import lru_cache


+@lru_cache(maxsize=128)
 def get_materials(session):
    """
    This function will retrieve all materials from the database.
    :return: A list of Material objects if successful, an empty list otherwise.
+
+
+    TODO: It might not be the best choice to store the materials data in a database table since this
+          table probably won't be very large and won't be updated that often. It might be better to
+          store this data in s3 and load it into memory when the app starts up. We will test this
    """

    materials = session.query(Material).filter(Material.is_active).all()
--- a/backend/app/db/models/materials.py
+++ b/backend/app/db/models/materials.py
@ -12,6 +12,7 @@ class MaterialType(enum.Enum):
    solid_floor_insulation = "solid_floor_insulation"
    external_wall_insulation = "external_wall_insulation"
    internal_wall_insulation = "internal_wall_insulation"
+    cavity_wall_insulation = "cavity_wall_insulation"


 class DepthUnit(enum.Enum):
--- a/backend/app/plan/router.py
+++ b/backend/app/plan/router.py
@ -1,50 +1,41 @@
-from collections import defaultdict
-from fastapi import APIRouter, Depends
-from backend.app.db.models.portfolio import rating_lookup
-from backend.app.dependencies import validate_token
-from backend.app.plan.schemas import PlanTriggerRequest
-from backend.app.utils import read_csv_from_s3
-from backend.app.config import get_settings
-from backend.Property import Property
-from epc_api.client import EpcClient
-from utils.logger import setup_logger
-from utils.s3 import read_from_s3
-from recommendations.FloorRecommendations import FloorRecommendations
-from recommendations.WallRecommendations import WallRecommendations
-from recommendations.config import UPGRADES_MAP
-from utils.uvalue_estimates import classify_decile_newvalues
-from backend.app.db.utils import row2dict
-from starlette.responses import Response
-from sqlalchemy.orm import sessionmaker
-from sqlalchemy.exc import IntegrityError, OperationalError
 from datetime import datetime
+
 import pandas as pd
-import msgpack
+from epc_api.client import EpcClient
+from fastapi import APIRouter, Depends
+from sqlalchemy.exc import IntegrityError, OperationalError
+from sqlalchemy.orm import sessionmaker
+from starlette.responses import Response

-# model apis
-from backend.ml_models.sap_change_model.api import SAPChangeModelAPI
-
-# database interaction functions
-from backend.app.db.functions.property_functions import (
-    create_property, create_property_targets, update_property_data, create_property_details_epc
-)
+from backend.app.config import get_settings
+from backend.app.db.connection import db_engine
 from backend.app.db.functions.materials_functions import get_materials
+from backend.app.db.functions.portfolio_functions import aggregate_portfolio_recommendations
+from backend.app.db.functions.property_functions import (
+    create_property, create_property_details_epc, create_property_targets, update_property_data
+)
 from backend.app.db.functions.recommendations_functions import (
    create_plan, create_plan_recommendations, upload_recommendations
 )
-from backend.app.db.functions.portfolio_functions import aggregate_portfolio_recommendations
-from backend.app.db.connection import db_engine
+from backend.app.db.models.portfolio import rating_lookup
+from backend.app.dependencies import validate_token
+from backend.app.plan.schemas import PlanTriggerRequest
+from backend.app.plan.utils import (
+    create_recommendation_scoring_data, filter_materials, get_cleaned, insert_temp_recommendation_id
+)
+from backend.app.utils import epc_to_sap_lower_bound, read_csv_from_s3, read_parquet_from_s3

-from model_data.optimiser.GainOptimiser import GainOptimiser
-from model_data.optimiser.CostOptimiser import CostOptimiser
-from backend.app.utils import epc_to_sap_lower_bound, read_parquet_from_s3
-from model_data.optimiser.optimiser_functions import prepare_input_measures
-from model_data.simulation_system.core.DataProcessor import DataProcessor
-from model_data.simulation_system.core.Settings import COLUMNS_TO_MERGE_ON
-
-# TODO: This is placeholder until data is stored in DB
-from backend.app.plan.uvalue_estimates_walls import uvalue_estimates_walls
-from backend.app.plan.uvalue_estimates_floors import uvalue_estimates_floors
+from backend.ml_models.sap_change_model.api import SAPChangeModelAPI
+from backend.Property import Property
+from etl.epc.DataProcessor import DataProcessor
+from etl.epc.settings import COLUMNS_TO_MERGE_ON
+from recommendations.FloorRecommendations import FloorRecommendations
+from recommendations.optimiser.CostOptimiser import CostOptimiser
+from recommendations.optimiser.GainOptimiser import GainOptimiser
+from recommendations.optimiser.optimiser_functions import prepare_input_measures
+from recommendations.WallRecommendations import WallRecommendations
+from utils.logger import setup_logger
+from utils.s3 import read_dataframe_from_s3_parquet

 logger = setup_logger()

@ -55,147 +46,25 @@ router = APIRouter(
    responses={404: {"description": "Not found"}}
 )

-# TODO: Load this data from db
-open_uprn_data = [
-    {'UPRN': 6032920, 'X_COORDINATE': 535110.0, 'Y_COORDINATE': 181819.0, 'LATITUDE': 51.5191407,
-     'LONGITUDE': -0.0540506},
-    {'UPRN': 6038625, 'X_COORDINATE': 535374.0, 'Y_COORDINATE': 182784.0, 'LATITUDE': 51.5277492,
-     'LONGITUDE': -0.0498772},
-    {'UPRN': 34153991, 'X_COORDINATE': 523238.74, 'Y_COORDINATE': 178003.02, 'LATITUDE': 51.4875579,
-     'LONGITUDE': -0.226392},
-    {'UPRN': 10008299676, 'X_COORDINATE': 533285.0, 'Y_COORDINATE': 184711.0, 'LATITUDE': 51.5455629,
-     'LONGITUDE': -0.0792445},
-    {'UPRN': 10008299677, 'X_COORDINATE': 533285.0, 'Y_COORDINATE': 184711.0, 'LATITUDE': 51.5455629,
-     'LONGITUDE': -0.0792445},
-    {'UPRN': 100021039066, 'X_COORDINATE': 535506.0, 'Y_COORDINATE': 185624.0, 'LATITUDE': 51.5532385,
-     'LONGITUDE': -0.0468833},
-    {'UPRN': 100021226060, 'X_COORDINATE': 529247.0, 'Y_COORDINATE': 187959.0, 'LATITUDE': 51.5756908,
-     'LONGITUDE': -0.1362513},
-    {'UPRN': 200003489276, 'X_COORDINATE': 533210.0, 'Y_COORDINATE': 179442.0, 'LATITUDE': 51.4982309,
-     'LONGITUDE': -0.0823165}
-]
-
-in_conservation_area_data = [
-    {'uprn': 6032920, 'is_in_conservation_area': 'not_in_conservation_area'},
-    {'uprn': 6038625, 'is_in_conservation_area': 'not_in_conservation_area'},
-    {'uprn': 34153991, 'is_in_conservation_area': 'unknown'},
-    {'uprn': 10008299676, 'is_in_conservation_area': 'in_conservation_area'},
-    {'uprn': 10008299677, 'is_in_conservation_area': 'in_conservation_area'},
-    {'uprn': 100021039066, 'is_in_conservation_area': 'not_in_conservation_area'},
-    {'uprn': 100021226060, 'is_in_conservation_area': 'in_conservation_area'},
-    {'uprn': 200003489276, 'is_in_conservation_area': 'in_conservation_area'}
-]
-
-# TODO: db
-floors_decile_data = {
-    'decile_labels': ['Decile 1', 'Decile 2', 'Decile 3', 'Decile 4', 'Decile 5', 'Decile 6', 'Decile 7', 'Decile 8',
-                      'Decile 9', 'Decile 10'], 'decile_boundaries': [6., 50., 56., 69., 77.6, 87., 98., 112.,
-                                                                      127., 150., 2279.]}
-
-walls_decile_data = {
-    'decile_labels': ['Decile 1', 'Decile 2', 'Decile 3', 'Decile 4', 'Decile 5', 'Decile 6', 'Decile 7', 'Decile 8',
-                      'Decile 9', 'Decile 10'], 'decile_boundaries': [6., 49., 51., 55., 64., 71., 76., 83., 96.,
-                                                                      120., 2279.]}
-
-
-def filter_materials(materials):
-    materials_by_type = defaultdict(list)
-
-    for material in materials:
-        material = row2dict(material)
-        material_type = material["type"]
-        materials_by_type[material_type].append(material)
-
-    # Optionally, you can convert the defaultdict to a normal dict if desired
-    materials_by_type = dict(materials_by_type)
-
-    return materials_by_type
-
-
-def insert_temp_recommendation_id(property_recommendations):
-    """
-    Creates a temporary recommendation id which is needed for
-    filtering recommendations between default and no, after the optimiser has been
-    run
-    :param property_recommendations:  nested list of recommendations, grouped by data_types
-    :return: Updated recommendations_to_upload, where where recommendation has a "recommendation_id"
-             integer inserted
-    """
-    idx = 0
-
-    for recs in property_recommendations:
-        for rec in recs:
-            rec["recommendation_id"] = idx
-            idx += 1
-
-    return property_recommendations
-
-
-def get_cleaned():
-    """
-    This function will retrieve the cleaned dataset from s3 which has the cleaned
-    descriptions for the epc dataset
-
-    This data is stored in MessagePack format and therefore needs to be decoded
-    :return:
-    """
-
-    cleaned = read_from_s3(
-        s3_file_name="cleaned_epc_data/cleaned.bson",
-        bucket_name="retrofit-data-{environment}".format(environment=get_settings().ENVIRONMENT)
-    )
-
-    cleaned = msgpack.unpackb(cleaned, raw=False)
-
-    return cleaned
-
-
-def create_recommendation_scoring_data(
-    property: Property,
-    recommendation: dict,
-    starting_epc_data: pd.DataFrame,
-    ending_epc_data: pd.DataFrame,
-    fixed_data: pd.DataFrame,
-):
-    """
-    This wrapper function prepares data to be passed to the sap model api
-    :return:
-    """
-
-    scoring_dict = {
-        "UPRN": property.data["uprn"],
-        "id": "+".join([str(property.id), str(recommendation["recommendation_id"])]),
-        "LOCAL_AUTHORITY": property.data["local-authority"],
-        **starting_epc_data.to_dict("records")[0],
-        **ending_epc_data.to_dict("records")[0],
-        **fixed_data.to_dict("records")[0]
-    }
-
-    # We update the description to indicate it's insulated
-    if recommendation["type"] == "wall_insulation":
-        scoring_dict["WALLS_DESCRIPTION_ENDING"] = UPGRADES_MAP[property.walls["clean_description"]]
-    elif recommendation["type"] == "floor_insulation":
-        scoring_dict["FLOOR_DESCRIPTION_ENDING"] = UPGRADES_MAP[property.floor["clean_description"]]
-    else:
-        raise NotImplementedError("Implement me")
-
-    return scoring_dict
-

@router.post("/trigger")
 async def trigger_plan(body: PlanTriggerRequest):
    logger.info("Connecting to db")
    session = sessionmaker(bind=db_engine)()
-    created_at = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
+    created_at = datetime.now().isoformat()

    try:
        session.begin()
        logger.info("Getting the inputs")
-        # Read in the trigger file from s3
-        bucket_name = get_settings().PLAN_TRIGGER_BUCKET
        epc_client = EpcClient(auth_token=get_settings().EPC_AUTH_TOKEN)
+        plan_input = read_csv_from_s3(bucket_name=get_settings().PLAN_TRIGGER_BUCKET, filepath=body.trigger_file_path)
+        uprn_filenames = read_dataframe_from_s3_parquet(
+            bucket_name=get_settings().DATA_BUCKET, file_key="spatial/filename_meta.parquet"
+        )
+        cleaning_data = read_parquet_from_s3(
+            bucket_name=get_settings().DATA_BUCKET, file_key="sap_change_model/cleaning_dataset.parquet",
+        )

-        plan_input = read_csv_from_s3(bucket_name=bucket_name, filepath=body.trigger_file_path)
        input_properties = []
        for config in plan_input:
            # We validate each record in the file. If the record is NOT valid, we need to handle this accordingly
@ -228,32 +97,21 @@ async def trigger_plan(body: PlanTriggerRequest):
        if not input_properties:
            return Response(status_code=204)

-        logger.info("Getting EPC, coordinates and conservation area data")
+        logger.info("Getting EPC, and spatial data")
        for p in input_properties:
            p.search_address_epc()
            p.set_year_built()
-
-            coordinate_data = [x for x in open_uprn_data if x['UPRN'] == int(p.data['uprn'])][0]
-            p.set_coordinates(coordinate_data)
-
-            in_conservation_area = [x for x in in_conservation_area_data if x['uprn'] == int(p.data['uprn'])][0].get(
-                "is_in_conservation_area"
-            )
-            p.set_is_in_conservation_area(in_conservation_area)
+            p.get_spatial_data(uprn_filenames)

        # The materials data could be cached or local so we don't need to make
        # consistent requests to the backend for
        # the same data
-        # TODO: It might not be the best choice to store the materials data in a database table since thi
-        #       table probably won't be very large and won't be updated that often. It might be better to
-        #       store this data in s3 load it into memory when the app starts up. We will test this
-
        logger.info("Reading in materials and cleaned datasets")
        materials = get_materials(session)
        materials_by_type = filter_materials(materials)
        cleaned = get_cleaned()

-        logger.info("Getting components and properties recommendations")
+        logger.info("Getting components and epc recommendations")

        # TODO: Move this to a class. We probably want a Recommender class which takes the injects the optimisers
        #      in as a dependency and then the optimisers can take the input measures in as part of the setup() method
@ -263,34 +121,13 @@ async def trigger_plan(body: PlanTriggerRequest):
        for p in input_properties:
            property_recommendations = []

-            # For each property, classiy floor area decide
-            total_floor_area_group_decile = classify_decile_newvalues(
-                decile_boundaries=floors_decile_data["decile_boundaries"],
-                decile_labels=floors_decile_data["decile_labels"],
-                new_values=[float(p.data["total-floor-area"])],
-            )[0]
-
            # Property recommendations
            p.get_components(cleaned)

-            # This is placeholder, until the full dataset is loaded into the database and we just make a read to the
-            # database
-            floors_u_value_estimate = [
-                x for x in uvalue_estimates_floors
-                if (x['local-authority'] == p.data["local-authority"]) &
-                   (x['property-type'] == p.data["property-type"]) &
-                   (x['built-form'] == p.data["built-form"]) &
-                   (x['floor-energy-eff'] == p.data["floor-energy-eff"] if p.data[
-                                                                               "floor-energy-eff"] != 'N/A' else True) &
-                   (x['floor-env-eff'] == p.data["floor-env-eff"] if p.data["floor-env-eff"] != 'N/A' else True)
-            ]
-
            # Floor recommendations
            floor_recommender = FloorRecommendations(
                property_instance=p,
-                uvalue_estimates=floors_u_value_estimate,
-                total_floor_area_group_decile=total_floor_area_group_decile,
-                materials=materials_by_type["suspended_floor_insulation"] + materials_by_type["solid_floor_insulation"],
+                materials=materials_by_type["floor"],
            )
            floor_recommender.recommend()

@ -298,30 +135,10 @@ async def trigger_plan(body: PlanTriggerRequest):
                property_recommendations.append(floor_recommender.recommendations)

            # Wall recommendations
-            # We would make this u-value query directly to the database
-            total_floor_area_group_decile = classify_decile_newvalues(
-                decile_boundaries=walls_decile_data["decile_boundaries"],
-                decile_labels=walls_decile_data["decile_labels"],
-                new_values=[float(p.data["total-floor-area"])],
-            )[0]
-
-            # This is placeholder, until the full dataset is loaded into the database and we just make a read to the
-            # database
-            walls_u_value_estimate = [
-                x for x in uvalue_estimates_walls
-                if (x['local-authority'] == p.data["local-authority"]) &
-                   (x['property-type'] == p.data["property-type"]) &
-                   (x['built-form'] == p.data["built-form"]) &
-                   (x['walls-energy-eff'] == p.data["walls-energy-eff"] if p.data[
-                                                                               "walls-energy-eff"] != 'N/A' else True) &
-                   (x['walls-env-eff'] == p.data["walls-env-eff"] if p.data["walls-env-eff"] != 'N/A' else True)
-            ]

            wall_recomender = WallRecommendations(
                property_instance=p,
-                uvalue_estimates=walls_u_value_estimate,
-                total_floor_area_group_decile=total_floor_area_group_decile,
-                materials=materials_by_type["external_wall_insulation"] + materials_by_type["internal_wall_insulation"]
+                materials=materials_by_type["walls"]
            )
            wall_recomender.recommend()

@ -337,12 +154,8 @@ async def trigger_plan(body: PlanTriggerRequest):
            recommendations[p.id] = property_recommendations

            # Finally, we'll prepare data for predicting the impact on SAP
-            # TODO: We should use the cleaned data from get_components in the data rather than the raw
-            #       values. We should create a method in Property which takes the EPC data and inserts the cleaned
-            #       data
-
            data_processor = DataProcessor(None, newdata=True)
-            data_processor.insert_data(pd.DataFrame([p.data.copy()]))
+            data_processor.insert_data(pd.DataFrame([p.get_model_data()]))
            data_processor.pre_process()

            starting_epc_data = data_processor.get_component_features(suffix="_STARTING")
@ -350,10 +163,10 @@ async def trigger_plan(body: PlanTriggerRequest):
            fixed_data = data_processor.get_fixed_features()

            # We update the ending record with the recommended updates and we set lodgement date to today
-            ending_epc_data["LODGEMENT_DATE_ENDING"] = created_at
+            ending_epc_data["DAYS_TO_ENDING"] = data_processor.calculate_days_to(created_at)

            for recommendations_by_type in property_recommendations:
-                for rec in recommendations_by_type:
+                for i, rec in enumerate(recommendations_by_type):
                    scoring_dict = create_recommendation_scoring_data(
                        property=p,
                        recommendation=rec,
@ -370,15 +183,6 @@ async def trigger_plan(body: PlanTriggerRequest):
        logger.info("Preparing data for scoring in sap change api")
        recommendations_scoring_data = pd.DataFrame(recommendations_scoring_data)

-        # Clean the data
-        logger.info("Reading in cleaning dataset from s3")
-        cleaning_data = read_parquet_from_s3(
-            bucket_name=get_settings().DATA_BUCKET,
-            file_key="sap_change_model/cleaning_dataset.parquet",
-        ).rename(columns={"local-authority": "LOCAL_AUTHORITY"})
-
-        # Merge the cleaning data onto recommendations_scoring_data
-
        # Perform the same cleaning as in the model
        recommendations_scoring_data = DataProcessor.apply_averages_cleaning(
            data_to_clean=recommendations_scoring_data,
@ -386,6 +190,13 @@ async def trigger_plan(body: PlanTriggerRequest):
            cols_to_merge_on=COLUMNS_TO_MERGE_ON + ["LOCAL_AUTHORITY"]
        ).drop(columns=["LOCAL_AUTHORITY"])

+        recommendations_scoring_data = DataProcessor.clean_missings_after_description_process(
+            recommendations_scoring_data, [
+                c for c in recommendations_scoring_data.columns if
+                ("thermal_transmittance" in c) or ("insulation_thickness" in c)
+            ]
+        )
+
        sap_change_model_api = SAPChangeModelAPI(portfolio_id=body.portfolio_id, timestamp=created_at)
        file_location = sap_change_model_api.upload_scoring_data(
            df=recommendations_scoring_data, bucket=get_settings().DATA_BUCKET
@ -396,14 +207,17 @@ async def trigger_plan(body: PlanTriggerRequest):

        # Retrieve the predictions
        predictions = pd.DataFrame(
-            read_csv_from_s3(bucket_name=get_settings().PREDICTIONS_BUCKET, filepath=response["storage_filepath"])
+            read_parquet_from_s3(
+                bucket_name=get_settings().PREDICTIONS_BUCKET,
+                file_key=response["storage_filepath"].split(get_settings().PREDICTIONS_BUCKET + "/")[1]
+            )
        )

-        predictions["RDSAP_CHANGE"] = predictions["RDSAP_CHANGE"].astype(float).round(1)
+        predictions["predictions"] = predictions["predictions"].astype(float).round(1)
        predictions[['property_id', 'recommendation_id']] = predictions['id'].str.split('+', expand=True)

        # Insert the predictions into the recommendations and run the optimiser
-        logger.info("Storing recommendations")
+        logger.info("Optimising recommendations")
        for property_id in recommendations.keys():

            property = [p for p in input_properties if p.id == property_id][0]
@ -411,9 +225,11 @@ async def trigger_plan(body: PlanTriggerRequest):

            for recommendations_by_type in recommendations[property_id]:
                for rec in recommendations_by_type:
-                    rec["sap_points"] = property_predictions[property_predictions["recommendation_id"] == str(
+                    new_sap = property_predictions[property_predictions["recommendation_id"] == str(
                        rec["recommendation_id"]
-                    )]["RDSAP_CHANGE"].values[0]
+                    )]["predictions"].values[0]
+
+                    rec["sap_points"] = new_sap - float(property.data["current-energy-efficiency"])

                    if rec["sap_points"] is None:
                        raise ValueError("Sap points missing")
@ -451,8 +267,6 @@ async def trigger_plan(body: PlanTriggerRequest):
            final_recommendations = [
                rec for recommendations_by_type in final_recommendations for rec in recommendations_by_type
            ]
-            # We update recommendations[property_id]
-
            recommendations[property_id] = final_recommendations

        # 1) the property data
--- a/backend/app/plan/temp_script_for_flight.py
+++ b/backend/app/plan/temp_script_for_flight.py
@ -0,0 +1,176 @@
+from datetime import datetime
+
+import pandas as pd
+from epc_api.client import EpcClient
+from fastapi import APIRouter, Depends
+from sqlalchemy.exc import IntegrityError, OperationalError
+from sqlalchemy.orm import sessionmaker
+from starlette.responses import Response
+
+from backend.app.config import get_settings
+from backend.app.db.connection import db_engine
+from backend.app.db.functions.materials_functions import get_materials
+from backend.app.db.functions.portfolio_functions import aggregate_portfolio_recommendations
+from backend.app.db.functions.property_functions import (
+    create_property, create_property_details_epc, create_property_targets, update_property_data
+)
+from backend.app.db.functions.recommendations_functions import (
+    create_plan, create_plan_recommendations, upload_recommendations
+)
+from backend.app.db.models.portfolio import rating_lookup
+from backend.app.dependencies import validate_token
+from backend.app.plan.schemas import PlanTriggerRequest
+from backend.app.plan.utils import (
+    create_recommendation_scoring_data, filter_materials, get_cleaned, insert_temp_recommendation_id
+)
+from backend.app.utils import epc_to_sap_lower_bound, read_csv_from_s3, read_parquet_from_s3
+
+from backend.ml_models.sap_change_model.api import SAPChangeModelAPI
+from backend.Property import Property
+from etl.epc.DataProcessor import DataProcessor
+from etl.epc.settings import COLUMNS_TO_MERGE_ON
+from recommendations.FloorRecommendations import FloorRecommendations
+from recommendations.optimiser.CostOptimiser import CostOptimiser
+from recommendations.optimiser.GainOptimiser import GainOptimiser
+from recommendations.optimiser.optimiser_functions import prepare_input_measures
+from recommendations.WallRecommendations import WallRecommendations
+from utils.logger import setup_logger
+from utils.s3 import read_dataframe_from_s3_parquet
+
+# Logger used for the progress message further down
+logger = setup_logger()
+
+import pickle
+
+# This is a flat script: everything from here on runs as soon as the file is executed or imported.
+# The three pickle files are opened relative to the current working directory.
+# NOTE(review): pickle.load can execute arbitrary code embedded in the file - only point this at
+#               snapshots you created yourself.
+with open('local_data.pickle', 'rb') as f:
+    local_data = pickle.load(f)
+
+# property_dimensions and sap_change_dataset are loaded here but not referenced again in this script
+with open("property_dimensions.pickle", "rb") as f:
+    property_dimensions = pickle.load(f)
+
+with open("sap_change_dataset.pickle", "rb") as f:
+    sap_change_dataset = pickle.load(f)
+
+# Naive local timestamp formatted as year-month-day-hour-minute-second; it is written to
+# LODGEMENT_DATE_ENDING in the scoring loop below
+created_at = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
+
+# Unpack the snapshot. uprn_filenames and cleaning_data are not referenced again in this script
+plan_input = local_data["plan_input"]
+uprn_filenames = local_data["uprn_filenames"]
+local_property_data = local_data["local_property_data"]
+materials = local_data["materials"]
+materials_by_type = filter_materials(materials)
+cleaned = local_data["cleaned"]
+cleaning_data = local_data["cleaning_data"]
+
+# Need to find some proper materials
+# Until then, two hard-coded example cavity wall insulation materials are appended to the "walls"
+# group so that the wall recommender has cavity wall materials to work with
+materials_by_type["walls"] += [
+    {'id': 4, 'type': 'cavity_wall_insulation', 'description': 'Example Material 1',
+     'depths': None,
+     'depth_unit': None, 'cost': 20,
+     'cost_unit': 'gbp_sq_meter', 'r_value_per_mm': 0.0278, 'r_value_unit': 'square_meter_kelvin_per_watt',
+     'thermal_conductivity': 0.036, 'thermal_conductivity_unit': 'watt_per_meter_kelvin',
+     'link': None, 'created_at': None, 'is_active': True},
+    {'id': 10, 'type': "cavity_wall_insulation", 'description': 'Example Material 2',
+     'depths': None, 'depth_unit': None, 'cost': 25, 'cost_unit': 'gbp_sq_meter',
+     'r_value_per_mm': 0.02631579, 'r_value_unit': 'square_meter_kelvin_per_watt', 'thermal_conductivity': 0.038,
+     'thermal_conductivity_unit': 'watt_per_meter_kelvin',
+     'link': None,
+     'created_at': None, 'is_active': True}
+]
+
+# A placeholder token is passed; the EPC data is assigned from the snapshot further down
+# NOTE(review): presumably the client is never used for a real request in this script - confirm
+epc_client = EpcClient(auth_token="NO-TOKEN")
+
+# Build one Property per row of plan_input. The id is taken from the entry of local_property_data
+# at the same index, so both lists are assumed to be in the same order
+input_properties = []
+for i, config in enumerate(plan_input):
+    property_id = local_property_data[i]["id"]
+    input_properties.append(
+        Property(
+            postcode=config['postcode'],
+            address1=config['address'],
+            epc_client=epc_client,
+            id=property_id
+        )
+    )
+
+logger.info("Getting EPC, and spatial data")
+# The EPC related attributes are copied from the snapshot onto each Property
+for i, p in enumerate(input_properties):
+    p.data = local_property_data[i]["data"]
+    p.uprn = local_property_data[i]["uprn"]
+    p.id = local_property_data[i]["id"]
+    p.full_sap_epc = local_property_data[i]["full_sap_epc"]
+    p.old_data = local_property_data[i]["old_data"]
+    # The listed / conservation area / heritage flags are forced to False for every property
+    p.is_listed = False
+    p.in_conservation_area = False
+    p.is_heritage = False
+
+    p.set_year_built()
+
+    # TODO: TESTING
+    # Hard-coded override of the number of habitable rooms, applied to every property
+    p.data['number-habitable-rooms'] = 3
+
+# recommendations maps property id -> nested list of recommendations (one inner list per type)
+# recommendations_scoring_data collects one scoring dict per recommendation
+recommendations = {}
+recommendations_scoring_data = []
+
+for p in input_properties:
+    property_recommendations = []
+
+    # Property recommendations
+    p.get_components(cleaned)
+
+    # Floor recommendations
+    floor_recommender = FloorRecommendations(
+        property_instance=p,
+        materials=materials_by_type["floor"],
+    )
+    floor_recommender.recommend()
+
+    # Only non-empty recommendation lists are kept
+    if floor_recommender.recommendations:
+        property_recommendations.append(floor_recommender.recommendations)
+
+    # Wall recommendations
+
+    wall_recomender = WallRecommendations(
+        property_instance=p,
+        materials=materials_by_type["walls"]
+    )
+    wall_recomender.recommend()
+
+    if wall_recomender.recommendations:
+        property_recommendations.append(wall_recomender.recommendations)
+
+    # We insert temporary ids into the recommendations which is important for the optimiser later
+    property_recommendations = insert_temp_recommendation_id(property_recommendations)
+
+    # Properties without any recommendation are skipped: nothing is stored or scored for them
+    if not property_recommendations:
+        continue
+
+    recommendations[p.id] = property_recommendations
+
+    # Finally, we'll prepare data for predicting the impact on SAP
+    # TODO: We should use the cleaned data from get_components in the data rather than the raw
+    #       values. We should create a method in Property which takes the EPC data and inserts the cleaned
+    #       data
+
+    # A fresh DataProcessor is created per property and fed a one-row frame built from a copy of p.data
+    data_processor = DataProcessor(None, newdata=True)
+    data_processor.insert_data(pd.DataFrame([p.data.copy()]))
+    data_processor.pre_process()
+
+    starting_epc_data = data_processor.get_component_features(suffix="_STARTING")
+    ending_epc_data = data_processor.get_component_features(suffix="_ENDING")
+    fixed_data = data_processor.get_fixed_features()
+
+    # We update the ending record with the recommended updates and we set lodgement date to today
+    ending_epc_data["LODGEMENT_DATE_ENDING"] = created_at
+
+    # One scoring dict per recommendation, all sharing this property's starting/ending/fixed features
+    for recommendations_by_type in property_recommendations:
+        for rec in recommendations_by_type:
+            scoring_dict = create_recommendation_scoring_data(
+                property=p,
+                recommendation=rec,
+                starting_epc_data=starting_epc_data,
+                ending_epc_data=ending_epc_data,
+                fixed_data=fixed_data,
+            )
+
+            recommendations_scoring_data.append(scoring_dict)
+
+# cleanup
+# NOTE(review): data_processor is only bound inside the loop, so this raises NameError when no
+#               property reached the scoring step (empty input, or every property was skipped)
+del data_processor
--- a/backend/app/plan/utils.py
+++ b/backend/app/plan/utils.py
@ -0,0 +1,187 @@
+import pandas as pd
+from backend.Property import Property
+from collections import defaultdict
+from utils.s3 import read_from_s3
+
+from recommendations.config import UPGRADES_MAP
+from recommendations.recommendation_utils import get_wall_u_value, get_floor_u_value, get_roof_u_value
+
+from backend.app.db.utils import row2dict
+from backend.app.config import get_settings
+import msgpack
+
+
+def filter_materials(materials):
+    """
+    Group material records by the building component they insulate
+
+    Every record is first converted to a dict with row2dict. The result always contains
+    both the "walls" and the "floor" key; a component without matching materials maps to
+    an empty list. Materials whose "type" belongs to neither component are dropped.
+    :param materials: iterable of material records accepted by row2dict
+    :return: dict mapping "walls" / "floor" to the list of material dicts of that component
+    """
+    types_by_component = {
+        "walls": ("internal_wall_insulation", "external_wall_insulation", "cavity_wall_insulation"),
+        "floor": ("suspended_floor_insulation", "solid_floor_insulation"),
+    }
+
+    material_rows = [row2dict(material) for material in materials]
+
+    return {
+        component: [row for row in material_rows if row["type"] in accepted_types]
+        for component, accepted_types in types_by_component.items()
+    }
+
+
+def insert_temp_recommendation_id(property_recommendations):
+    """
+    Tags every recommendation with a temporary, sequential "recommendation_id"
+
+    The ids start at 0 and keep counting across the groups, so they are unique within the
+    nested list passed in. They are needed to tell recommendations apart once the
+    optimiser has been run. The recommendation dicts are modified in place.
+    :param property_recommendations: nested list of recommendations, grouped by type
+    :return: the same nested list, where each recommendation has an integer
+             "recommendation_id" inserted
+    """
+    flattened = (rec for group in property_recommendations for rec in group)
+
+    for temp_id, rec in enumerate(flattened):
+        rec["recommendation_id"] = temp_id
+
+    return property_recommendations
+
+
+def get_cleaned():
+    """
+    Retrieves the cleaned dataset, holding the cleaned descriptions for the epc dataset,
+    from s3
+
+    The object "cleaned_epc_data/cleaned.bson" is read from the environment specific
+    retrofit data bucket. The content is stored in MessagePack format and is therefore
+    decoded with msgpack before it is returned
+    :return: the decoded dataset
+    """
+    settings = get_settings()
+    bucket_name = "retrofit-data-{environment}".format(environment=settings.ENVIRONMENT)
+
+    packed = read_from_s3(s3_file_name="cleaned_epc_data/cleaned.bson", bucket_name=bucket_name)
+
+    return msgpack.unpackb(packed, raw=False)
+
+
+def _wall_u_value(prop: Property):
+    """Calls get_wall_u_value with the wall attributes and age band of the property"""
+    return get_wall_u_value(
+        clean_description=prop.walls["clean_description"],
+        age_band=prop.age_band,
+        is_granite_or_whinstone=prop.walls["is_granite_or_whinstone"],
+        is_sandstone_or_limestone=prop.walls["is_sandstone_or_limestone"]
+    )
+
+
+def _floor_u_value(prop: Property):
+    """Calls get_floor_u_value with the floor geometry, insulation and age band of the property"""
+    return get_floor_u_value(
+        floor_type=prop.floor_type,
+        area=prop.floor_area,
+        perimeter=prop.perimeter,
+        wall_type=prop.wall_type,
+        insulation_thickness=prop.floor["insulation_thickness"],
+        age_band=prop.age_band,
+    )
+
+
+def _roof_u_value(prop: Property):
+    """Calls get_roof_u_value with the roof attributes and age band of the property"""
+    return get_roof_u_value(
+        insulation_thickness=prop.roof["insulation_thickness"],
+        has_dwelling_above=prop.roof["has_dwelling_above"],
+        is_loft=prop.roof["is_loft"],
+        is_roof_room=prop.roof["is_roof_room"],
+        is_thatched=prop.roof["is_thatched"],
+        age_band=prop.age_band,
+        is_flat=prop.roof["is_flat"],
+        is_pitched=prop.roof["is_pitched"],
+        is_at_rafters=prop.roof["is_at_rafters"],
+    )
+
+
+def create_recommendation_scoring_data(
+    property: Property,
+    recommendation: dict,
+    starting_epc_data: pd.DataFrame,
+    ending_epc_data: pd.DataFrame,
+    fixed_data: pd.DataFrame,
+):
+    """
+    This wrapper function prepares data to be passed to the sap model api
+
+    The first record of each of the three frames is merged into a single dict (on a key
+    clash, a later frame wins over an earlier one). Missing u-values are then filled in
+    from the property, missing insulation thicknesses are set to "none", and the ENDING
+    values of the component targeted by the recommendation are replaced with the upgrade.
+    :param property: the property the recommendation belongs to
+    :param recommendation: recommendation dict; "recommendation_id" and "type" are always read,
+                           "new_u_value" for wall and floor insulation, "parts" for floor insulation
+    :param starting_epc_data: frame with the starting component features, only the first row is used
+    :param ending_epc_data: frame with the ending component features, only the first row is used
+    :param fixed_data: frame with the fixed features, only the first row is used
+    :return: dict with one entry per feature, plus "UPRN", "id" and "LOCAL_AUTHORITY"
+    :raises NotImplementedError: for a floor recommendation with more than one part, and for
+                                 any recommendation type other than wall or floor insulation
+    """
+
+    scoring_dict = {
+        "UPRN": property.data["uprn"],
+        # "<property id>+<recommendation id>", split again when the predictions are read back
+        "id": "+".join([str(property.id), str(recommendation["recommendation_id"])]),
+        "LOCAL_AUTHORITY": property.data["local-authority"],
+        **starting_epc_data.to_dict("records")[0],
+        **ending_epc_data.to_dict("records")[0],
+        **fixed_data.to_dict("records")[0]
+    }
+
+    # Set starting u-values if we don't have them
+    if not scoring_dict["walls_thermal_transmittance"]:
+        scoring_dict["walls_thermal_transmittance"] = _wall_u_value(property)
+
+    if not scoring_dict["floor_thermal_transmittance"]:
+        scoring_dict["floor_thermal_transmittance"] = _floor_u_value(property)
+
+    if not scoring_dict["roof_thermal_transmittance"]:
+        scoring_dict["roof_thermal_transmittance"] = _roof_u_value(property)
+
+    for col in [
+        "walls_insulation_thickness", "floor_insulation_thickness", "roof_insulation_thickness"
+    ]:
+        if scoring_dict[col] is None:
+            scoring_dict[col] = "none"
+
+    # We update the ending wall values to indicate it's insulated
+    if recommendation["type"] == "wall_insulation":
+        # The upgrade made here is to the u-value of the walls and the description of the
+        # insulation thickness
+        scoring_dict["walls_thermal_transmittance_ENDING"] = recommendation["new_u_value"]
+        scoring_dict["walls_insulation_thickness_ENDING"] = "above average"
+    else:
+        # Walls are not being upgraded: the ending values only get their missing entries filled
+        if not scoring_dict["walls_thermal_transmittance_ENDING"]:
+            scoring_dict["walls_thermal_transmittance_ENDING"] = _wall_u_value(property)
+
+        if scoring_dict["walls_insulation_thickness_ENDING"] is None:
+            scoring_dict["walls_insulation_thickness_ENDING"] = "none"
+
+    # We update the ending floor values to indicate it's insulated
+    if recommendation["type"] == "floor_insulation":
+
+        if len(recommendation["parts"]) > 1:
+            raise NotImplementedError("Have more than 1 floor insulation part - handle this case")
+
+        scoring_dict["floor_thermal_transmittance_ENDING"] = recommendation["new_u_value"]
+        # We don't really see above average for this in the training data
+        scoring_dict["floor_insulation_thickness_ENDING"] = "average"
+    else:
+        # Floor is not being upgraded: the ending values only get their missing entries filled
+        if not scoring_dict["floor_thermal_transmittance_ENDING"]:
+            scoring_dict["floor_thermal_transmittance_ENDING"] = _floor_u_value(property)
+
+        if scoring_dict["floor_insulation_thickness_ENDING"] is None:
+            scoring_dict["floor_insulation_thickness_ENDING"] = "none"
+
+    if recommendation["type"] not in ["wall_insulation", "floor_insulation"]:
+        raise NotImplementedError("Implement me")
+
+    # There is no roof recommendation type yet, so the roof is treated like a component
+    # that is not being upgraded
+    if not scoring_dict["roof_thermal_transmittance_ENDING"]:
+        scoring_dict["roof_thermal_transmittance_ENDING"] = _roof_u_value(property)
+
+    # The thickness default is applied independently of the u-value, as for walls and floor.
+    # It used to be nested under the u-value check, which left None in place whenever the
+    # roof u-value was already known
+    if scoring_dict["roof_insulation_thickness_ENDING"] is None:
+        scoring_dict["roof_insulation_thickness_ENDING"] = "none"
+
+    return scoring_dict
--- a/backend/app/plan/uvalue_estimates_floors.py
+++ b/backend/app/plan/uvalue_estimates_floors.py
--- a/backend/app/plan/uvalue_estimates_walls.py
+++ b/backend/app/plan/uvalue_estimates_walls.py
--- a/backend/app/utils.py
+++ b/backend/app/utils.py
@ -79,17 +79,17 @@ def sap_to_epc(sap_points: int):
    if sap_points <= 0 or sap_points > 100:
        raise ValueError("SAP points should be between 1 and 100.")

-    if sap_points > 91:
+    if sap_points >= 92:
        return "A"
-    elif sap_points > 80:
+    elif sap_points >= 81:
        return "B"
-    elif sap_points > 69:
+    elif sap_points >= 69:
        return "C"
-    elif sap_points > 55:
+    elif sap_points >= 55:
        return "D"
-    elif sap_points > 39:
+    elif sap_points >= 39:
        return "E"
-    elif sap_points > 21:
+    elif sap_points >= 21:
        return "F"
    else:
        return "G"
@ -108,13 +108,13 @@ def epc_to_sap_lower_bound(epc: str):
    elif epc == "B":
        return 81
    elif epc == "C":
-        return 70
+        return 69
    elif epc == "D":
-        return 56
+        return 55
    elif epc == "E":
-        return 40
+        return 39
    elif epc == "F":
-        return 22
+        return 21
    elif epc == "G":
        return 1
    else:
--- a/backend/ml_models/sap_change_model/api.py
+++ b/backend/ml_models/sap_change_model/api.py
@ -62,14 +62,14 @@ class SAPChangeModelAPI:
        logger.info("Making request to sap change api")
        url = f"{self.base_url}/sapmodel/predict"
        payload = {
-            "file_location": f"s3://retrofit-data-dev/{file_location}",
+            "file_location": file_location,
            "property_id": "",  # This should get removed
            "portfolio_id": self.portfolio_id,
            "created_at": self.timestamp
        }

        try:
-            response = requests.post(url, json=payload, headers={"Content-Type": "application/json"})
+            response = requests.post(url, json=payload, headers={"Content-Type": "application/json"}, timeout=120)

            # Check if the response status code is 2xx (success)
            response.raise_for_status()
--- a/backend/requirements/base.txt
+++ b/backend/requirements/base.txt
@ -34,4 +34,5 @@ pytz==2023.3
 mip==1.15.0
 boto3==1.28.3
 pandas==1.5.3
-pyarrow==12.0.1
+pyarrow==12.0.1
+textblob
--- a/backend/tests/test_property.py
+++ b/backend/tests/test_property.py
@ -1,15 +1,17 @@
 import pytest
-import pandas as pd
 from unittest.mock import Mock
 from epc_api.client import EpcClient
 from backend.Property import Property
-from open_uprn.OpenUprnClient import OpenUprnClient
-from model_data.EpcClean import EpcClean
+from etl.epc_clean.EpcClean import EpcClean

 # Define some test data
 mock_epc_response = {
    "rows": [
        {
+            "lmk-key": 1,
+            "uprn": 1,
+            "number-habitable-rooms": 5,
+            "property-type": "House",
            "inspection-date": "2023-06-01",
            "some-other-key": "some-value",
            "roof-description": "Roof Description",
@ -34,6 +36,10 @@ mock_epc_response = {
            "construction-age-band": "England and Wales: 1967-1975"
        },
        {
+            "lmk-key": 2,
+            "uprn": 2,
+            "number-habitable-rooms": 5,
+            "property-type": "House",
            "inspection-date": "2023-05-01",
            "some-other-key": "some-other-value",
            "roof-description": "Roof Description",
@ -63,6 +69,10 @@ mock_epc_response = {
 mock_epc_response_dupe = {
    'rows': [
        {
+            "lmk-key": 1,
+            "uprn": 1,
+            "number-habitable-rooms": 5,
+            "property-type": "House",
            'inspection-date': '2023-06-01', 'some-other-key': 'some-value', 'roof-description': 'Roof Description',
            'walls-description': 'Walls Description', 'windows-description': 'Windows Description',
            'mainheat-description': 'Main Heating Description', 'hotwater-description': 'Hot Water Description',
@ -83,6 +93,10 @@ mock_epc_response_dupe = {
            "construction-age-band": "England and Wales: 1967-1975"
        },
        {
+            "lmk-key": 2,
+            "uprn": 2,
+            "number-habitable-rooms": 5,
+            "property-type": "House",
            'inspection-date': '2023-05-01', 'some-other-key': 'some-other-value',
            'roof-description': 'Roof Description', 'walls-description': 'Walls Description',
            'windows-description': 'Windows Description', 'mainheat-description': 'Main Heating Description',
@ -104,6 +118,10 @@ mock_epc_response_dupe = {
            "construction-age-band": "England and Wales: 1967-1975"
        },
        {
+            "lmk-key": 3,
+            "uprn": 3,
+            "number-habitable-rooms": 5,
+            "property-type": "House",
            'inspection-date': '2023-06-01', 'some-other-key': 'duplicate-date',
            'roof-description': 'Roof Description',
            'walls-description': 'Walls Description', 'windows-description': 'Windows Description',
@ -130,7 +148,7 @@ mock_epc_response_dupe = {

 class TestProperty:
    @pytest.fixture(autouse=True)
-    def property_instance(self, mock_epc_client, mock_open_uprn_client, mock_cleaner):
+    def property_instance(self, mock_epc_client, mock_cleaner):
        property_instance = Property(1, "AB12CD", "Test Address", epc_client=mock_epc_client)
        return property_instance

@ -141,29 +159,18 @@ class TestProperty:

    @pytest.fixture
    def mock_epc_client(self):
-        mock_epc_client = Mock(spec=EpcClient())
+        mock_epc_client = Mock(spec=EpcClient(auth_token="mocked_auth_token"))
        mock_epc_client.domestic.search.return_value = mock_epc_response.copy()
        mock_epc_client.auth_token = "mocked_auth_token"
        return mock_epc_client

    @pytest.fixture
    def mock_epc_client_dupe_data(self):
-        mock_epc_client_dupe_data = Mock(spec=EpcClient())
+        mock_epc_client_dupe_data = Mock(spec=EpcClient(auth_token="mocked_auth_token"))
        mock_epc_client_dupe_data.domestic.search.return_value = mock_epc_response_dupe.copy()
        mock_epc_client_dupe_data.auth_token = "mocked_auth_token"
        return mock_epc_client_dupe_data

-    @pytest.fixture
-    def mock_open_uprn_client(self):
-        mock_open_uprn_client = Mock(spec=OpenUprnClient(path=None, uprns=[12345]))
-        mock_open_uprn_client.data = pd.DataFrame(
-            [
-                {"UPRN": 12345, "longitude": 1.2345, "latitude": 2.3456},
-                {"UPRN": 12346, "longitude": 3.4567, "latitude": 4.5678}
-            ]
-        )
-        return mock_open_uprn_client
-
    @pytest.fixture
    def mock_cleaner(self):
        lighting_averages = [
@ -186,9 +193,22 @@ class TestProperty:
        )

        mock_cleaner = Mock(spec=cleaner_spec)
+
+        walls_data = {
+            "original_description": "Walls Description",
+            "is_cavity_wall": True,
+            "is_solid_brick": False,
+            "is_timber_frame": False,
+            "is_system_built": False,
+            "is_park_home": False,
+            "is_cob": False,
+            "is_sandstone_or_limestone": False,
+            "is_granite_or_whinstone": False,
+        }
+
        mock_cleaner.cleaned = {
            "roof-description": [{"original_description": "Roof Description"}],
-            "walls-description": [{"original_description": "Walls Description"}],
+            "walls-description": [walls_data],
            "windows-description": [{"original_description": "Windows Description"}],
            "mainheat-description": [{"original_description": "Main Heating Description"}],
            "hotwater-description": [{"original_description": "Hot Water Description"}],
@ -201,10 +221,10 @@ class TestProperty:
        # Should be mocked auth token
        assert inst1.epc_client.auth_token == "mocked_auth_token"

-        inst2 = Property(3, "AB12CD", "Test Address")
+        inst2 = Property(3, "AB12CD", "Test Address", epc_client=mock_epc_client)
        assert inst2.epc_client.auth_token

-        inst3 = Property(4, "AB12CD", "Test Address", data={"some": "data"})
+        inst3 = Property(4, "AB12CD", "Test Address", data={"some": "data"}, epc_client=mock_epc_client)
        assert inst3.data == {"some": "data"}

        data = inst3.search_address_epc()
@ -227,11 +247,23 @@ class TestProperty:

        # Verify that the components are set correctly
        assert property_instance.roof == {"original_description": "Roof Description"}
-        assert property_instance.walls == {"original_description": "Walls Description"}
+        assert property_instance.walls == {
+            "original_description": "Walls Description",
+            "is_cavity_wall": True,
+            "is_solid_brick": False,
+            "is_timber_frame": False,
+            "is_system_built": False,
+            "is_park_home": False,
+            "is_cob": False,
+            "is_sandstone_or_limestone": False,
+            "is_granite_or_whinstone": False,
+        }
        assert property_instance.windows == {"original_description": "Windows Description"}
        assert property_instance.main_heating == {"original_description": "Main Heating Description"}
        assert property_instance.hotwater == {"original_description": "Hot Water Description"}

+        assert property_instance.wall_type == "cavity"
+
    def test_get_components_without_cleaned_data(self, property_instance, mock_cleaner):
        # Modify the mock EpcClean to not have cleaned data
        mock_cleaner.cleaned = {}
--- a/backend/tests/test_sap_model_prep.py
+++ b/backend/tests/test_sap_model_prep.py
--- a/conservation_areas/app.py
+++ b/conservation_areas/app.py
@ -1,51 +0,0 @@
-"""
-This application reads in the open uprn data from a static location and loads it into
-our database for querying from other services
-"""
-
-import os
-from conservation_areas.ConservationAreaClient import ConservationAreaClient
-from datatypes.datatypes import OpenUprnCoordinateData
-
-
-def app():
-    conservation_area_client = ConservationAreaClient(
-        historic_england_path=os.path.abspath(
-            os.path.dirname(__file__)
-        ) + "/model_data/local_data/Historic_Eng_Conservation_Areas/Conservation_Areas.shp",
-        gov_path=os.path.abspath(
-            os.path.dirname(__file__)
-        ) + "/model_data/local_data/gov-conservation-area.geojson"
-    )
-    conservation_area_client.read()
-
-    # We need to iterate through the open uprn data and check if the coordinates are in a conservation area
-    open_uprn_data = [
-        {'UPRN': 6032920, 'X_COORDINATE': 535110.0, 'Y_COORDINATE': 181819.0, 'LATITUDE': 51.5191407,
-         'LONGITUDE': -0.0540506},
-        {'UPRN': 6038625, 'X_COORDINATE': 535374.0, 'Y_COORDINATE': 182784.0, 'LATITUDE': 51.5277492,
-         'LONGITUDE': -0.0498772},
-        {'UPRN': 34153991, 'X_COORDINATE': 523238.74, 'Y_COORDINATE': 178003.02, 'LATITUDE': 51.4875579,
-         'LONGITUDE': -0.226392},
-        {'UPRN': 10008299676, 'X_COORDINATE': 533285.0, 'Y_COORDINATE': 184711.0, 'LATITUDE': 51.5455629,
-         'LONGITUDE': -0.0792445},
-        {'UPRN': 10008299677, 'X_COORDINATE': 533285.0, 'Y_COORDINATE': 184711.0, 'LATITUDE': 51.5455629,
-         'LONGITUDE': -0.0792445},
-        {'UPRN': 100021039066, 'X_COORDINATE': 535506.0, 'Y_COORDINATE': 185624.0, 'LATITUDE': 51.5532385,
-         'LONGITUDE': -0.0468833},
-        {'UPRN': 100021226060, 'X_COORDINATE': 529247.0, 'Y_COORDINATE': 187959.0, 'LATITUDE': 51.5756908,
-         'LONGITUDE': -0.1362513},
-        {'UPRN': 200003489276, 'X_COORDINATE': 533210.0, 'Y_COORDINATE': 179442.0, 'LATITUDE': 51.4982309,
-         'LONGITUDE': -0.0823165}
-    ]
-
-    result = [
-        {
-            "uprn": coordinates["UPRN"],
-            "is_in_conservation_area": conservation_area_client.is_in_conservation_area(
-                OpenUprnCoordinateData(**coordinates))
-        } for coordinates in
-        open_uprn_data
-    ]
-
-    # TODO: Add a method to write to the database
--- a/data_collection/README.md
+++ b/data_collection/README.md
@ -1,5 +0,0 @@
-# Data Collection
-
-This service is specifically focused on the collection of data external sources which aren't easily
-accessed via api or via downloadable data sources. For example, wages data requires a specific application to
-pull that data from websites, e.g. from Adzuna's api
--- a/data_collection/adzuna.py
+++ b/data_collection/adzuna.py
@ -1,86 +0,0 @@
-import requests
-import json
-from data_collection.config import ADZUNA_API_KEY, ADZUNA_APP_ID
-
-import pandas as pd
-import os
-import time
-from tqdm import tqdm
-
-"""
-Table of constituencies and their codes can be downloaded from the Office of National Statistics, found here:
-https://geoportal.statistics.gov.uk/datasets/ons::westminster-parliamentary-constituencies-december-2022-names-and
-codes-in-the-united-kingdom/explore
-"""
-
-constituencies = pd.read_csv(
-    os.path.abspath(
-        os.path.dirname(
-            __file__)) + "/data_collection/data/Westminster_Parliamentary_Constituencies_("
-                         "December_2022)_Names_and_Codes_in_the_United_Kingdom.csv"
-)
-
-constituencies["location_type"] = "constituency"
-
-
-def retry_api_call(job_title, location, max_retries=10):
-    for i in range(max_retries):
-        try:
-            response = get_adzuna_jobs(job_title, location)
-            return response
-        except (requests.HTTPError, requests.ConnectionError):
-            print(f"Attempt {i + 1} failed. Retrying in 2 seconds...")
-            time.sleep(2)
-    print(f"Failed after {max_retries} attempts.")
-    return None
-
-
-def get_adzuna_jobs(job_title, location):
-    base_url = "https://api.adzuna.com/v1/api/jobs"
-    country_code = "gb"
-
-    url = f"{base_url}/{country_code}/search/1"
-
-    params = {
-        "app_id": ADZUNA_APP_ID,
-        "app_key": ADZUNA_API_KEY,
-        "results_per_page": 25,
-        "what": job_title,
-        "where": location,
-        "content-type": "application/json",
-        "distance": 10
-    }
-
-    response = requests.get(url, params=params)
-    response.raise_for_status()
-
-    jobs = json.loads(response.text)
-    return jobs
-
-
-JOB_TITLES = [
-    "insulation installer", "internal wall insulation installer", "external wall insulation installer",
-    "cavity wall insulation installer", "loft insulation installer", "roof insulation installer",
-    "spray foam insulation installer", "insulation technician", "insulation engineer", "iwi insulation installer",
-    "iwi installer", "ewi insulation installer", "ewi installer", "cwi insulation installer", "cwi installer",
-]
-
-results = []
-for i, job_title in enumerate(JOB_TITLES):
-    print("Pulling job title %s of %s" % (str(i + 1), str(len(JOB_TITLES))))
-    for _, location_config in tqdm(constituencies.iterrows(), total=constituencies.shape[0]):
-
-        location = location_config["PCON22NM"]
-        jobs = retry_api_call(job_title, location)
-        time.sleep(0.5)
-        if jobs["results"]:
-            for job in jobs['results']:
-                to_append = {
-                    "job_title": job_title,
-                    "search_location": location,
-                    "search_location_code": location_config["PCON22CD"],
-                    **job
-                }
-                results.append(to_append)
-
-results_df = pd.DataFrame(results)
--- a/data_collection/config.py
+++ b/data_collection/config.py
@ -1,7 +0,0 @@
-import os
-from dotenv import load_dotenv
-
-load_dotenv(dotenv_path='data_collection/.env')
-
-ADZUNA_API_KEY = os.environ.get('ADZUNA_API_KEY')
-ADZUNA_APP_ID = os.environ.get('ADZUNA_APP_ID')
--- a/data_collection/data/.DS_Store
+++ b/data_collection/data/.DS_Store
--- a/data_collection/data/Westminster_Parliamentary_Constituencies_(December_2022)_Names_and_Codes_in_the_United_Kingdom.csv
+++ b/data_collection/data/Westminster_Parliamentary_Constituencies_(December_2022)_Names_and_Codes_in_the_United_Kingdom.csv
@ -1,651 +0,0 @@
-PCON22CD,PCON22NM,ObjectId
-E14000530,Aldershot,1
-E14000531,Aldridge-Brownhills,2
-E14000532,Altrincham and Sale West,3
-E14000533,Amber Valley,4
-E14000534,Arundel and South Downs,5
-E14000535,Ashfield,6
-E14000536,Ashford,7
-E14000537,Ashton-under-Lyne,8
-E14000538,Aylesbury,9
-E14000539,Banbury,10
-E14000540,Barking,11
-E14000541,Barnsley Central,12
-E14000542,Barnsley East,13
-E14000543,Barrow and Furness,14
-E14000544,Basildon and Billericay,15
-E14000545,Basingstoke,16
-E14000546,Bassetlaw,17
-E14000547,Bath,18
-E14000548,Batley and Spen,19
-E14000549,Battersea,20
-E14000550,Beaconsfield,21
-E14000551,Beckenham,22
-E14000552,Bedford,23
-E14000553,Bermondsey and Old Southwark,24
-E14000554,Berwick-upon-Tweed,25
-E14000555,Bethnal Green and Bow,26
-E14000556,Beverley and Holderness,27
-E14000557,Bexhill and Battle,28
-E14000558,Bexleyheath and Crayford,29
-E14000559,Birkenhead,30
-E14000560,"Birmingham, Edgbaston",31
-E14000561,"Birmingham, Erdington",32
-E14000562,"Birmingham, Hall Green",33
-E14000563,"Birmingham, Hodge Hill",34
-E14000564,"Birmingham, Ladywood",35
-E14000565,"Birmingham, Northfield",36
-E14000566,"Birmingham, Perry Barr",37
-E14000567,"Birmingham, Selly Oak",38
-E14000568,"Birmingham, Yardley",39
-E14000569,Bishop Auckland,40
-E14000570,Blackburn,41
-E14000571,Blackley and Broughton,42
-E14000572,Blackpool North and Cleveleys,43
-E14000573,Blackpool South,44
-E14000574,Blaydon,45
-E14000575,Blyth Valley,46
-E14000576,Bognor Regis and Littlehampton,47
-E14000577,Bolsover,48
-E14000578,Bolton North East,49
-E14000579,Bolton South East,50
-E14000830,Newbury,51
-E14000831,Newcastle upon Tyne Central,52
-E14000832,Newcastle upon Tyne East,53
-E14000833,Newcastle upon Tyne North,54
-E14000834,Newcastle-under-Lyme,55
-E14000835,Newton Abbot,56
-E14000836,"Normanton, Pontefract and Castleford",57
-E14000837,North Cornwall,58
-E14000838,North Devon,59
-E14000839,North Dorset,60
-E14000840,North Durham,61
-E14000841,North East Bedfordshire,62
-E14000842,North East Cambridgeshire,63
-E14000843,North East Derbyshire,64
-E14000844,North East Hampshire,65
-E14000845,North East Hertfordshire,66
-E14000846,North East Somerset,67
-E14000847,North Herefordshire,68
-E14000848,North Norfolk,69
-E14000849,North Shropshire,70
-E14000850,North Somerset,71
-E14000851,North Swindon,72
-E14000852,North Thanet,73
-E14000853,North Tyneside,74
-E14000854,North Warwickshire,75
-E14000855,North West Cambridgeshire,76
-E14000856,North West Durham,77
-E14000857,North West Hampshire,78
-E14000858,North West Leicestershire,79
-E14000859,North West Norfolk,80
-E14000860,North Wiltshire,81
-E14000861,Northampton North,82
-E14000862,Northampton South,83
-E14000863,Norwich North,84
-E14000864,Norwich South,85
-E14000865,Nottingham East,86
-E14000866,Nottingham North,87
-E14000867,Nottingham South,88
-E14000868,Nuneaton,89
-E14000869,Old Bexley and Sidcup,90
-E14000870,Oldham East and Saddleworth,91
-E14000871,Oldham West and Royton,92
-E14000872,Orpington,93
-E14000873,Oxford East,94
-E14000874,Oxford West and Abingdon,95
-E14000875,Pendle,96
-E14000876,Penistone and Stocksbridge,97
-E14000877,Penrith and The Border,98
-E14000878,Peterborough,99
-E14000879,"Plymouth, Moor View",100
-E14000580,Bolton West,101
-E14000581,Bootle,102
-E14000582,Boston and Skegness,103
-E14000583,Bosworth,104
-E14000584,Bournemouth East,105
-E14000585,Bournemouth West,106
-E14000586,Bracknell,107
-E14000587,Bradford East,108
-E14000588,Bradford South,109
-E14000589,Bradford West,110
-E14000590,Braintree,111
-E14000591,Brent Central,112
-E14000592,Brent North,113
-E14000593,Brentford and Isleworth,114
-E14000594,Brentwood and Ongar,115
-E14000595,Bridgwater and West Somerset,116
-E14000596,Brigg and Goole,117
-E14000597,"Brighton, Kemptown",118
-E14000598,"Brighton, Pavilion",119
-E14000599,Bristol East,120
-E14000600,Bristol North West,121
-E14000601,Bristol South,122
-E14000602,Bristol West,123
-E14000603,Broadland,124
-E14000604,Bromley and Chislehurst,125
-E14000605,Bromsgrove,126
-E14000606,Broxbourne,127
-E14000607,Broxtowe,128
-E14000608,Buckingham,129
-E14000609,Burnley,130
-E14000610,Burton,131
-E14000611,Bury North,132
-E14000612,Bury South,133
-E14000613,Bury St Edmunds,134
-E14000614,Calder Valley,135
-E14000615,Camberwell and Peckham,136
-E14000616,Camborne and Redruth,137
-E14000617,Cambridge,138
-E14000618,Cannock Chase,139
-E14000619,Canterbury,140
-E14000620,Carlisle,141
-E14000621,Carshalton and Wallington,142
-E14000622,Castle Point,143
-E14000623,Central Devon,144
-E14000624,Central Suffolk and North Ipswich,145
-E14000625,Charnwood,146
-E14000626,Chatham and Aylesford,147
-E14000627,Cheadle,148
-E14000628,Chelmsford,149
-E14000629,Chelsea and Fulham,150
-E14000630,Cheltenham,151
-E14000631,Chesham and Amersham,152
-E14000632,Chesterfield,153
-E14000633,Chichester,154
-E14000634,Chingford and Woodford Green,155
-E14000635,Chippenham,156
-E14000636,Chipping Barnet,157
-E14000637,Chorley,158
-E14000638,Christchurch,159
-E14000639,Cities of London and Westminster,160
-E14000640,City of Chester,161
-E14000641,City of Durham,162
-E14000642,Clacton,163
-E14000643,Cleethorpes,164
-E14000644,Colchester,165
-E14000645,Colne Valley,166
-E14000646,Congleton,167
-E14000647,Copeland,168
-E14000648,Corby,169
-E14000649,Coventry North East,170
-E14000650,Coventry North West,171
-E14000651,Coventry South,172
-E14000652,Crawley,173
-E14000653,Crewe and Nantwich,174
-E14000654,Croydon Central,175
-E14000655,Croydon North,176
-E14000656,Croydon South,177
-E14000657,Dagenham and Rainham,178
-E14000658,Darlington,179
-E14000659,Dartford,180
-E14000660,Daventry,181
-E14000661,Denton and Reddish,182
-E14000662,Derby North,183
-E14000663,Derby South,184
-E14000664,Derbyshire Dales,185
-E14000665,Devizes,186
-E14000666,Dewsbury,187
-E14000667,Don Valley,188
-E14000668,Doncaster Central,189
-E14000669,Doncaster North,190
-E14000670,Dover,191
-E14000671,Dudley North,192
-E14000672,Dudley South,193
-E14000673,Dulwich and West Norwood,194
-E14000674,Ealing Central and Acton,195
-E14000675,Ealing North,196
-E14000676,"Ealing, Southall",197
-E14000677,Easington,198
-E14000678,East Devon,199
-E14000679,East Ham,200
-E14000780,Leeds North West,201
-E14000781,Leeds West,202
-E14000782,Leicester East,203
-E14000783,Leicester South,204
-E14000784,Leicester West,205
-E14000785,Leigh,206
-E14000786,Lewes,207
-E14000787,Lewisham East,208
-E14000788,Lewisham West and Penge,209
-E14000789,"Lewisham, Deptford",210
-E14000790,Leyton and Wanstead,211
-E14000791,Lichfield,212
-E14000792,Lincoln,213
-E14000793,"Liverpool, Riverside",214
-E14000794,"Liverpool, Walton",215
-E14000795,"Liverpool, Wavertree",216
-E14000796,"Liverpool, West Derby",217
-E14000797,Loughborough,218
-E14000798,Louth and Horncastle,219
-E14000799,Ludlow,220
-E14000800,Luton North,221
-E14000801,Luton South,222
-E14000802,Macclesfield,223
-E14000803,Maidenhead,224
-E14000804,Maidstone and The Weald,225
-E14000805,Makerfield,226
-E14000806,Maldon,227
-E14000807,Manchester Central,228
-E14000808,"Manchester, Gorton",229
-E14000809,"Manchester, Withington",230
-E14000810,Mansfield,231
-E14000811,Meon Valley,232
-E14000812,Meriden,233
-E14000813,Mid Bedfordshire,234
-E14000814,Mid Derbyshire,235
-E14000815,Mid Dorset and North Poole,236
-E14000816,Mid Norfolk,237
-E14000817,Mid Sussex,238
-E14000818,Mid Worcestershire,239
-E14000819,Middlesbrough,240
-E14000820,Middlesbrough South and East Cleveland,241
-E14000821,Milton Keynes North,242
-E14000822,Milton Keynes South,243
-E14000823,Mitcham and Morden,244
-E14000824,Mole Valley,245
-E14000825,Morecambe and Lunesdale,246
-E14000826,Morley and Outwood,247
-E14000827,New Forest East,248
-E14000828,New Forest West,249
-E14000829,Newark,250
-E14000680,East Hampshire,251
-E14000681,East Surrey,252
-E14000682,East Worthing and Shoreham,253
-E14000683,East Yorkshire,254
-E14000880,"Plymouth, Sutton and Devonport",255
-E14000684,Eastbourne,256
-E14000685,Eastleigh,257
-E14000881,Poole,258
-E14000686,Eddisbury,259
-E14000882,Poplar and Limehouse,260
-E14000687,Edmonton,261
-E14000883,Portsmouth North,262
-E14000688,Ellesmere Port and Neston,263
-E14000884,Portsmouth South,264
-E14000689,Elmet and Rothwell,265
-E14000885,Preston,266
-E14000690,Eltham,267
-E14000886,Pudsey,268
-E14000691,Enfield North,269
-E14000887,Putney,270
-E14000692,"Enfield, Southgate",271
-E14000888,Rayleigh and Wickford,272
-E14000693,Epping Forest,273
-E14000889,Reading East,274
-E14000694,Epsom and Ewell,275
-E14000890,Reading West,276
-E14000695,Erewash,277
-E14000891,Redcar,278
-E14000696,Erith and Thamesmead,279
-E14000892,Redditch,280
-E14000697,Esher and Walton,281
-E14000893,Reigate,282
-E14000698,Exeter,283
-E14000894,Ribble Valley,284
-E14000699,Fareham,285
-E14000895,Richmond (Yorks),286
-E14000700,Faversham and Mid Kent,287
-E14000896,Richmond Park,288
-E14000701,Feltham and Heston,289
-E14000897,Rochdale,290
-E14000702,Filton and Bradley Stoke,291
-E14000898,Rochester and Strood,292
-E14000703,Finchley and Golders Green,293
-E14000899,Rochford and Southend East,294
-E14000704,Folkestone and Hythe,295
-E14000900,Romford,296
-E14000705,Forest of Dean,297
-E14000901,Romsey and Southampton North,298
-E14000706,Fylde,299
-E14000902,Rossendale and Darwen,300
-E14000707,Gainsborough,301
-E14000903,Rother Valley,302
-E14000904,Rotherham,303
-E14000905,Rugby,304
-E14000906,"Ruislip, Northwood and Pinner",305
-E14000907,Runnymede and Weybridge,306
-E14000908,Rushcliffe,307
-E14000909,Rutland and Melton,308
-E14000910,Saffron Walden,309
-E14000911,Salford and Eccles,310
-E14000912,Salisbury,311
-E14000913,Scarborough and Whitby,312
-E14000914,Scunthorpe,313
-E14000915,Sedgefield,314
-E14000916,Sefton Central,315
-E14000917,Selby and Ainsty,316
-E14000918,Sevenoaks,317
-E14000919,Sheffield Central,318
-E14000920,Sheffield South East,319
-E14000921,"Sheffield, Brightside and Hillsborough",320
-E14000922,"Sheffield, Hallam",321
-E14000923,"Sheffield, Heeley",322
-E14000924,Sherwood,323
-E14000925,Shipley,324
-E14000926,Shrewsbury and Atcham,325
-E14000927,Sittingbourne and Sheppey,326
-E14000928,Skipton and Ripon,327
-E14000929,Sleaford and North Hykeham,328
-E14000730,Harrogate and Knaresborough,329
-E14000731,Harrow East,330
-E14000732,Harrow West,331
-E14000733,Hartlepool,332
-E14000734,Harwich and North Essex,333
-E14000735,Hastings and Rye,334
-E14000736,Havant,335
-E14000737,Hayes and Harlington,336
-E14000738,Hazel Grove,337
-E14000739,Hemel Hempstead,338
-E14000740,Hemsworth,339
-E14000741,Hendon,340
-E14000742,Henley,341
-E14000743,Hereford and South Herefordshire,342
-E14000744,Hertford and Stortford,343
-E14000745,Hertsmere,344
-E14000746,Hexham,345
-E14000747,Heywood and Middleton,346
-E14000748,High Peak,347
-E14000749,Hitchin and Harpenden,348
-E14000750,Holborn and St Pancras,349
-E14000751,Hornchurch and Upminster,350
-E14000752,Hornsey and Wood Green,351
-E14000753,Horsham,352
-E14000754,Houghton and Sunderland South,353
-E14000755,Hove,354
-E14000756,Huddersfield,355
-E14000757,Huntingdon,356
-E14000758,Hyndburn,357
-E14000759,Ilford North,358
-E14000760,Ilford South,359
-E14000761,Ipswich,360
-E14000762,Isle of Wight,361
-E14000763,Islington North,362
-E14000764,Islington South and Finsbury,363
-E14000765,Jarrow,364
-E14000766,Keighley,365
-E14000767,Kenilworth and Southam,366
-E14000768,Kensington,367
-E14000769,Kettering,368
-E14000770,Kingston and Surbiton,369
-E14000771,Kingston upon Hull East,370
-E14000772,Kingston upon Hull North,371
-E14000773,Kingston upon Hull West and Hessle,372
-E14000774,Kingswood,373
-E14000775,Knowsley,374
-E14000776,Lancaster and Fleetwood,375
-E14000777,Leeds Central,376
-E14000778,Leeds East,377
-E14000779,Leeds North East,378
-E14000708,Garston and Halewood,379
-E14000709,Gateshead,380
-E14000710,Gedling,381
-E14000711,Gillingham and Rainham,382
-E14000712,Gloucester,383
-E14000713,Gosport,384
-E14000714,Grantham and Stamford,385
-E14000715,Gravesham,386
-E14000716,Great Grimsby,387
-E14000717,Great Yarmouth,388
-E14000718,Greenwich and Woolwich,389
-E14000719,Guildford,390
-E14000720,Hackney North and Stoke Newington,391
-E14000721,Hackney South and Shoreditch,392
-E14000722,Halesowen and Rowley Regis,393
-E14000723,Halifax,394
-E14000724,Haltemprice and Howden,395
-E14000725,Halton,396
-E14000726,Hammersmith,397
-E14000727,Hampstead and Kilburn,398
-E14000728,Harborough,399
-E14000729,Harlow,400
-E14000930,Slough,401
-E14000931,Solihull,402
-E14000932,Somerton and Frome,403
-E14000933,South Basildon and East Thurrock,404
-E14000934,South Cambridgeshire,405
-E14000935,South Derbyshire,406
-E14000936,South Dorset,407
-E14000937,South East Cambridgeshire,408
-E14000938,South East Cornwall,409
-E14000939,South Holland and The Deepings,410
-E14000940,South Leicestershire,411
-E14000941,South Norfolk,412
-E14000942,South Northamptonshire,413
-E14000943,South Ribble,414
-E14000944,South Shields,415
-E14000945,South Staffordshire,416
-E14000946,South Suffolk,417
-E14000947,South Swindon,418
-E14000948,South Thanet,419
-E14000949,South West Bedfordshire,420
-E14000950,South West Devon,421
-E14000951,South West Hertfordshire,422
-E14000952,South West Norfolk,423
-E14000953,South West Surrey,424
-E14000954,South West Wiltshire,425
-E14000955,"Southampton, Itchen",426
-E14000956,"Southampton, Test",427
-E14000957,Southend West,428
-E14000958,Southport,429
-E14000959,Spelthorne,430
-E14000960,St Albans,431
-E14000961,St Austell and Newquay,432
-E14000962,St Helens North,433
-E14000963,St Helens South and Whiston,434
-E14000964,St Ives,435
-E14000965,Stafford,436
-E14000966,Staffordshire Moorlands,437
-E14000967,Stalybridge and Hyde,438
-E14000968,Stevenage,439
-E14000969,Stockport,440
-E14000970,Stockton North,441
-E14000971,Stockton South,442
-E14000972,Stoke-on-Trent Central,443
-E14000973,Stoke-on-Trent North,444
-E14000974,Stoke-on-Trent South,445
-E14000975,Stone,446
-E14000976,Stourbridge,447
-E14000977,Stratford-on-Avon,448
-E14000978,Streatham,449
-E14000979,Stretford and Urmston,450
-E14000980,Stroud,451
-E14000981,Suffolk Coastal,452
-E14000982,Sunderland Central,453
-E14000983,Surrey Heath,454
-E14000984,Sutton and Cheam,455
-E14000985,Sutton Coldfield,456
-E14000986,Tamworth,457
-E14000987,Tatton,458
-E14000988,Taunton Deane,459
-E14000989,Telford,460
-E14000990,Tewkesbury,461
-E14000991,The Cotswolds,462
-E14000992,The Wrekin,463
-E14000993,Thirsk and Malton,464
-E14000994,Thornbury and Yate,465
-E14000995,Thurrock,466
-E14000996,Tiverton and Honiton,467
-E14000997,Tonbridge and Malling,468
-E14000998,Tooting,469
-E14000999,Torbay,470
-E14001000,Torridge and West Devon,471
-E14001001,Totnes,472
-E14001002,Tottenham,473
-E14001003,Truro and Falmouth,474
-E14001004,Tunbridge Wells,475
-E14001005,Twickenham,476
-E14001006,Tynemouth,477
-E14001007,Uxbridge and South Ruislip,478
-E14001008,Vauxhall,479
-E14001009,Wakefield,480
-E14001010,Wallasey,481
-E14001011,Walsall North,482
-E14001012,Walsall South,483
-E14001013,Walthamstow,484
-E14001014,Wansbeck,485
-E14001015,Wantage,486
-E14001016,Warley,487
-E14001017,Warrington North,488
-E14001018,Warrington South,489
-E14001019,Warwick and Leamington,490
-E14001020,Washington and Sunderland West,491
-E14001021,Watford,492
-E14001022,Waveney,493
-E14001023,Wealden,494
-E14001024,Weaver Vale,495
-E14001025,Wellingborough,496
-E14001026,Wells,497
-E14001027,Welwyn Hatfield,498
-E14001028,Wentworth and Dearne,499
-E14001029,West Bromwich East,500
-E14001030,West Bromwich West,501
-E14001031,West Dorset,502
-E14001032,West Ham,503
-E14001033,West Lancashire,504
-E14001034,West Suffolk,505
-E14001035,West Worcestershire,506
-E14001036,Westminster North,507
-E14001037,Westmorland and Lonsdale,508
-E14001038,Weston-Super-Mare,509
-E14001039,Wigan,510
-E14001040,Wimbledon,511
-E14001041,Winchester,512
-E14001042,Windsor,513
-E14001043,Wirral South,514
-E14001044,Wirral West,515
-E14001045,Witham,516
-E14001046,Witney,517
-E14001047,Woking,518
-E14001048,Wokingham,519
-E14001049,Wolverhampton North East,520
-E14001050,Wolverhampton South East,521
-E14001051,Wolverhampton South West,522
-E14001052,Worcester,523
-E14001053,Workington,524
-E14001054,Worsley and Eccles South,525
-E14001055,Worthing West,526
-E14001056,Wycombe,527
-E14001057,Wyre and Preston North,528
-E14001058,Wyre Forest,529
-E14001059,Wythenshawe and Sale East,530
-E14001060,Yeovil,531
-E14001061,York Central,532
-E14001062,York Outer,533
-N06000001,Belfast East,534
-N06000002,Belfast North,535
-N06000003,Belfast South,536
-N06000004,Belfast West,537
-N06000005,East Antrim,538
-N06000006,East Londonderry,539
-N06000007,Fermanagh and South Tyrone,540
-N06000008,Foyle,541
-N06000009,Lagan Valley,542
-N06000010,Mid Ulster,543
-N06000011,Newry and Armagh,544
-N06000012,North Antrim,545
-N06000013,North Down,546
-N06000014,South Antrim,547
-N06000015,South Down,548
-N06000016,Strangford,549
-N06000017,Upper Bann,550
-S14000050,Ochil and South Perthshire,551
-S14000051,Orkney and Shetland,552
-S14000052,Paisley and Renfrewshire North,553
-S14000053,Paisley and Renfrewshire South,554
-S14000054,Perth and North Perthshire,555
-S14000055,"Ross, Skye and Lochaber",556
-S14000056,Rutherglen and Hamilton West,557
-S14000057,Stirling,558
-S14000058,West Aberdeenshire and Kincardine,559
-S14000059,West Dunbartonshire,560
-W07000041,Ynys Môn,561
-W07000042,Delyn,562
-W07000043,Alyn and Deeside,563
-W07000044,Wrexham,564
-W07000045,Llanelli,565
-W07000046,Gower,566
-W07000047,Swansea West,567
-W07000048,Swansea East,568
-W07000049,Aberavon,569
-W07000050,Cardiff Central,570
-W07000051,Cardiff North,571
-W07000052,Rhondda,572
-W07000053,Torfaen,573
-W07000054,Monmouth,574
-W07000055,Newport East,575
-W07000056,Newport West,576
-W07000057,Arfon,577
-W07000058,Aberconwy,578
-W07000059,Clwyd West,579
-W07000060,Vale of Clwyd,580
-W07000061,Dwyfor Meirionnydd,581
-W07000062,Clwyd South,582
-W07000063,Montgomeryshire,583
-W07000064,Ceredigion,584
-W07000065,Preseli Pembrokeshire,585
-W07000066,Carmarthen West and South Pembrokeshire,586
-W07000067,Carmarthen East and Dinefwr,587
-W07000068,Brecon and Radnorshire,588
-W07000069,Neath,589
-W07000070,Cynon Valley,590
-W07000071,Merthyr Tydfil and Rhymney,591
-W07000072,Blaenau Gwent,592
-W07000073,Bridgend,593
-W07000074,Ogmore,594
-W07000075,Pontypridd,595
-W07000076,Caerphilly,596
-W07000077,Islwyn,597
-W07000078,Vale of Glamorgan,598
-W07000079,Cardiff West,599
-W07000080,Cardiff South and Penarth,600
-N06000018,West Tyrone,601
-S14000001,Aberdeen North,602
-S14000002,Aberdeen South,603
-S14000003,Airdrie and Shotts,604
-S14000004,Angus,605
-S14000005,Argyll and Bute,606
-S14000006,"Ayr, Carrick and Cumnock",607
-S14000007,Banff and Buchan,608
-S14000008,"Berwickshire, Roxburgh and Selkirk",609
-S14000009,"Caithness, Sutherland and Easter Ross",610
-S14000010,Central Ayrshire,611
-S14000011,"Coatbridge, Chryston and Bellshill",612
-S14000012,"Cumbernauld, Kilsyth and Kirkintilloch East",613
-S14000013,Dumfries and Galloway,614
-S14000014,"Dumfriesshire, Clydesdale and Tweeddale",615
-S14000015,Dundee East,616
-S14000016,Dundee West,617
-S14000017,Dunfermline and West Fife,618
-S14000018,East Dunbartonshire,619
-S14000019,"East Kilbride, Strathaven and Lesmahagow",620
-S14000020,East Lothian,621
-S14000021,East Renfrewshire,622
-S14000022,Edinburgh East,623
-S14000023,Edinburgh North and Leith,624
-S14000024,Edinburgh South,625
-S14000025,Edinburgh South West,626
-S14000026,Edinburgh West,627
-S14000027,Na h-Eileanan an Iar,628
-S14000028,Falkirk,629
-S14000029,Glasgow Central,630
-S14000030,Glasgow East,631
-S14000031,Glasgow North,632
-S14000032,Glasgow North East,633
-S14000033,Glasgow North West,634
-S14000034,Glasgow South,635
-S14000035,Glasgow South West,636
-S14000036,Glenrothes,637
-S14000037,Gordon,638
-S14000038,Inverclyde,639
-S14000039,"Inverness, Nairn, Badenoch and Strathspey",640
-S14000040,Kilmarnock and Loudoun,641
-S14000041,Kirkcaldy and Cowdenbeath,642
-S14000042,Lanark and Hamilton East,643
-S14000043,Linlithgow and East Falkirk,644
-S14000044,Livingston,645
-S14000045,Midlothian,646
-S14000046,Moray,647
-S14000047,Motherwell and Wishaw,648
-S14000048,North Ayrshire and Arran,649
-S14000049,North East Fife,650
--- a/data_collection/local_authority.py
+++ b/data_collection/local_authority.py
@ -1 +0,0 @@
-
--- a/data_collection/requirements.txt
+++ b/data_collection/requirements.txt
@ -1,4 +0,0 @@
-requests
-python-dotenv
-pandas
-tqdm
--- a/model_data/init.py
+++ b/model_data/init.py
--- a/model_data/simulation_system/core/DataProcessor.py
+++ b/model_data/simulation_system/core/DataProcessor.py
@ -1,26 +1,61 @@
 from pathlib import Path
 import numpy as np
 import pandas as pd
-from model_data.BaseUtility import Definitions
-from model_data.simulation_system.core.Settings import (
+from BaseUtility import Definitions
+from etl.epc.settings import (
    DATA_PROCESSOR_SETTINGS,
    EARLIEST_EPC_DATE,
    FULLY_GLAZED_DESCRIPTIONS,
    AVERAGE_FIXED_FEATURES,
-    FLOOR_LEVEL_MAP,
    BUILT_FORM_REMAP,
    COLUMNS_TO_MERGE_ON,
-    COMPONENT_FEATURES,
    FIXED_FEATURES,
    COLUMNTYPES,
    RDSAP_RESPONSE,
    MAX_SAP_SCORE,
    fill_na_map,
-    FIXED_DESCRIPTON_MAPPED_FEATURES
+    STARTING_SUFFIX_COMPONENT_COLS,
+    NO_SUFFIX_COMPONENT_COLS,
+    ENDING_SUFFIX_COMPONENT_COLS
 )
+from recommendations.rdsap_tables import FLOOR_LEVEL_MAP

 from typing import List

+# These lookups are used to clean the construction age band
+# bounds_map: standard age band label -> inclusive lower ("l") and upper ("u") year bounds.
+# The first band starts at 0 and the last ends at 3000 so that the open-ended bands have bounds too.
+bounds_map = {
+    "England and Wales: before 1900": {"l": 0, "u": 1899},
+    "England and Wales: 1930-1949": {"l": 1930, "u": 1949},
+    "England and Wales: 1900-1929": {"l": 1900, "u": 1929},
+    "England and Wales: 1950-1966": {"l": 1950, "u": 1966},
+    "England and Wales: 1967-1975": {"l": 1967, "u": 1975},
+    "England and Wales: 1976-1982": {"l": 1976, "u": 1982},
+    "England and Wales: 1983-1990": {"l": 1983, "u": 1990},
+    "England and Wales: 1991-1995": {"l": 1991, "u": 1995},
+    "England and Wales: 1996-2002": {"l": 1996, "u": 2002},
+    "England and Wales: 2003-2006": {"l": 2003, "u": 2006},
+    "England and Wales: 2007-2011": {"l": 2007, "u": 2011},
+    "England and Wales: 2012 onwards": {"l": 2012, "u": 3000},
+}
+
+# remap: non-standard label -> the standard label (a key of bounds_map) it is replaced with
+remap = {
+    "England and Wales: 2007 onwards": "England and Wales: 2007-2011"
+}
+
+# expanded_map: every year from 0 to 3000 inclusive -> label of the first band in bounds_map
+# whose inclusive bounds contain that year
+expanded_map = {
+    i: [
+        label for label, bounds in bounds_map.items() if (i <= bounds["u"]) and (i >= bounds['l'])
+    ][0] for i in range(0, 3001)
+}
+
+
+def is_int(x):
+    """
+    Report whether int(x) succeeds.
+
+    :param x: any value
+    :return: True if int(x) can be evaluated, otherwise False
+    """
+    try:
+        int(x)
+    # Only the failures int() itself produces are treated as "not an int": TypeError for
+    # unsupported types such as None, ValueError for non-numeric strings and NaN, and
+    # OverflowError for infinities. Anything else (e.g. KeyboardInterrupt) propagates.
+    except (TypeError, ValueError, OverflowError):
+        return False
+    return True
+

 class DataProcessor:
    """
@ -46,66 +81,36 @@ class DataProcessor:
    def insert_data(self, data: pd.DataFrame) -> None:
        self.data = data

+    @staticmethod
+    def clean_construction_age_band(x):
+        """
+        Map a single raw construction age band value onto one of the standard band labels.
+
+        :param x: raw value: a standard label, a known non-standard label, a year (anything
+            int() accepts), a data anomaly marker or a missing value
+        :return: x unchanged when it is an anomaly marker, missing, or already a standard label;
+            otherwise the standard label it maps to
+        :raises KeyError: if x converts to an int that is not a key of expanded_map
+        :raises NotImplementedError: if x matches none of the handled cases
+        """
+        # Firstly, we check if it's an error value. pd.isna is used for the missing check because
+        # membership in [None, np.nan] only matches the np.nan object itself, so a NaN that is a
+        # different float object would fall through to the NotImplementedError below
+        if x in Definitions.DATA_ANOMALY_MATCHES or pd.isna(x):
+            return x
+
+        # Next, we check if it's a value in our map
+        if x in bounds_map:
+            return x
+
+        # We check if it's a standard remap value
+        if x in remap:
+            return remap[x]
+
+        # We check if it's a number
+        if is_int(x):
+            return expanded_map[int(x)]
+
+        raise NotImplementedError("Not handled the case for value %s" % x)
+
    def standardise_construction_age_band(self):
        """
        This function will tidy up some of the non-standard values that are populated in the construction age
        band, which is useful for cleaning
        """
-        bounds_map = {
-            "England and Wales: before 1900": {"l": 0, "u": 1899},
-            "England and Wales: 1930-1949": {"l": 1930, "u": 1949},
-            "England and Wales: 1900-1929": {"l": 1900, "u": 1929},
-            "England and Wales: 1950-1966": {"l": 1950, "u": 1966},
-            "England and Wales: 1967-1975": {"l": 1967, "u": 1975},
-            "England and Wales: 1976-1982": {"l": 1976, "u": 1982},
-            "England and Wales: 1983-1990": {"l": 1983, "u": 1990},
-            "England and Wales: 1991-1995": {"l": 1991, "u": 1995},
-            "England and Wales: 1996-2002": {"l": 1996, "u": 2002},
-            "England and Wales: 2003-2006": {"l": 2003, "u": 2006},
-            "England and Wales: 2007-2011": {"l": 2007, "u": 2011},
-            "England and Wales: 2012 onwards": {"l": 2012, "u": 3000},
-        }
-
-        remap = {
-            "England and Wales: 2007 onwards": "England and Wales: 2007-2011"
-        }
-
-        expanded_map = {
-            i: [
-                label for label, bounds in bounds_map.items() if (i <= bounds["u"]) and (i >= bounds['l'])
-            ][0] for i in range(0, 3001)
-        }
-
-        def is_int(x):
-            try:
-                int(x)
-                return True
-            except:
-                return False
-
-        def clean_construction_age_band(x):
-            # Firstly, we check if it's an error value
-            if x in Definitions.DATA_ANOMALY_MATCHES or x in [None, np.nan]:
-                return x
-
-            # Next, we check if it's a value in our map
-            if bounds_map.get(x):
-                return x
-
-            # We check if it's a standard remap value
-            remap_value = remap.get(x, None)
-            if remap_value:
-                return remap_value
-
-            # We check if it's a number
-            if is_int(x):
-                x_int = int(x)
-                return expanded_map[x_int]
-
-            raise NotImplementedError("Not handled the case for value %s" % x)

        self.data["CONSTRUCTION_AGE_BAND"] = self.data["CONSTRUCTION_AGE_BAND"].apply(
-            lambda x: clean_construction_age_band(x)
+            lambda x: self.clean_construction_age_band(x)
        )

        self.data = self.data[
@ -157,18 +162,6 @@ class DataProcessor:
                    break
                to_index -= 1

-    def reformat_columns(self):
-        """
-        This function applies the re-formattng of columns from lower case to capitalised
-
-        When requesting the epc data from the api, the columns are lower case
-        and separated by a hyphen, whereas in the bulk download, the columns
-        are capitalised and separated by underscores. If rename_columns is True
-        we convert the columns from lower case to capitalised format
-        :return:
-        """
-        self.data.columns = [col.upper().replace("-", "_") for col in self.data.columns]
-
    def pre_process(self) -> pd.DataFrame:
        """
        Load data and begin initial cleaning
@ -176,22 +169,24 @@ class DataProcessor:
        if self.data is None:
            self.load_data(low_memory=DATA_PROCESSOR_SETTINGS["low_memory"])

-        if self.newdata:
-            self.reformat_columns()
-
        if not self.newdata:
            self.confine_data()

        self.remap_columns()

        # We have some non-standard construction age bands which we'll clean for matching
-        self.standardise_construction_age_band()
-        self.clean_missing_rooms()
+        if not self.newdata:
+            self.standardise_construction_age_band()
+
+            self.clean_missing_rooms()

        self.recast_df_columns(
            column_mappings=DATA_PROCESSOR_SETTINGS["column_mappings"]
        )
-        self.clean_multi_glaze_proportion()
+
+        if not self.newdata:
+            self.clean_multi_glaze_proportion()
+
        self.clean_photo_supply()

        if not self.newdata:
@ -203,16 +198,24 @@ class DataProcessor:
            # If we have multiple EPC records, we can try and do filling
            self.fill_na_fields()

-        self.data = self.data.sort_values(["UPRN", "LODGEMENT_DATE"], ascending=True)
+        if not self.newdata:
+            self.data = self.data.sort_values(["UPRN", "LODGEMENT_DATE"], ascending=True)
+
        # Final re-casting after data transformed and prepared
-        self.data = self.data.astype(COLUMNTYPES)
+        coltypes = {k: v for k, v in COLUMNTYPES.items() if k in self.data.columns} if self.newdata else COLUMNTYPES
+        self.data = self.data.astype(coltypes)

        self.na_remapping()

        return self.data

    def na_remapping(self):
-        for column, fill_value in fill_na_map.items():
+
+        fill_na_map_apply = {
+            k: v for k, v in fill_na_map.items() if k in self.data.columns
+        } if self.newdata else fill_na_map
+
+        for column, fill_value in fill_na_map_apply.items():
            self.data[column] = self.data[column].fillna(fill_value)

    def fill_na_fields(self, columns_to_fill: List = COLUMNS_TO_MERGE_ON):
@ -255,7 +258,8 @@ class DataProcessor:
        data = data.replace(np.NAN, None)

        # Remap certain columns
-        data["FLOOR_LEVEL"] = data["FLOOR_LEVEL"].replace(FLOOR_LEVEL_MAP)
+        if not self.newdata:
+            data["FLOOR_LEVEL"] = data["FLOOR_LEVEL"].replace(FLOOR_LEVEL_MAP)
        data["BUILT_FORM"] = data["BUILT_FORM"].replace(BUILT_FORM_REMAP)

        convert_to_lower = ["TRANSACTION_TYPE"]
@ -348,7 +352,7 @@ class DataProcessor:

            cleaning_averages_filled = cleaning_averages_filled.drop(columns=f"{variable}_BUILT_FORM_AVERAGE")

-            # If there still is na values, use average across all properties in consituecy
+            # If there are still NA values, use the average across all EPCs in the constituency
            cleaning_averages_filled[variable] = cleaning_averages_filled[
                variable
            ].fillna(cleaning_averages_filled[variable].mean())
@ -497,9 +501,15 @@ class DataProcessor:
        """

        if suffix not in ["_STARTING", "_ENDING"]:
-            raise Exception("Suffix should be one of _STARTING or _ENFING")
+            raise Exception("Suffix should be one of _STARTING or _ENDING")

-        return self.data[COMPONENT_FEATURES + ["LODGEMENT_DATE"]].copy().add_suffix(suffix)
+        if suffix == "_STARTING":
+            starting_cols = self.data[STARTING_SUFFIX_COMPONENT_COLS].copy().add_suffix(suffix)
+            fixed_cols = self.data[NO_SUFFIX_COMPONENT_COLS].copy()
+
+            return pd.concat([starting_cols, fixed_cols], axis=1)
+
+        return self.data[ENDING_SUFFIX_COMPONENT_COLS].copy().add_suffix(suffix)

    def get_fixed_features(self) -> pd.DataFrame:
        """
@ -529,125 +539,33 @@ class DataProcessor:

        return df

-    @classmethod
-    def difference_data(cls, df: pd.DataFrame):
+    @staticmethod
+    def calculate_days_to(lodgement_date):

-        """
-        Given a dataframe and starting and ending columns, this function will convert the features to
-        differenced the ending subtract the starting value, which is useful for modelling the difference responces
-        """
+        """
+        Days elapsed between EARLIEST_EPC_DATE and the given lodgement date(s).
+
+        :param lodgement_date: a single date (string or datetime-like) or a pandas Series of dates
+        :return: the day count for a single date, or a Series of day counts for a Series
+        """
+        elapsed = pd.to_datetime(lodgement_date) - pd.to_datetime(EARLIEST_EPC_DATE)

-        # We ensure that the u value columns are co-erced to a numerical format
-        uvalue_columns = [col for col in df.columns if "thermal_transmittance" in col]
-        for uvalue_col in uvalue_columns:
-            df[uvalue_col] = pd.to_numeric(df[uvalue_col])
+        # A single date gives one Timedelta, which exposes .days directly; a Series gives a
+        # timedelta Series, which needs the .dt accessor. Checking the result rather than
+        # isinstance(lodgement_date, str) lets datetime/Timestamp scalars work as well as strings
+        if isinstance(elapsed, pd.Timedelta):
+            return elapsed.days
+
+        return elapsed.dt.days

-        key_columns = [
-            "RDSAP_CHANGE", "HEAT_DEMAND_CHANGE", "CARBON_CHANGE",
-            "SAP_STARTING", "HEAT_DEMAND_STARTING",
-            "CARBON_STARTING", "UPRN", "CONSTITUENCY",
-            "SAP_ENDING", "CARBON_ENDING", "HEAT_DEMAND_ENDING",
-            "DAYS_TO_STARTING", "DAYS_TO_ENDING"
-        ]
+    @staticmethod
+    def clean_missings_after_description_process(df, ignore_cols=None):
+        missings = pd.isnull(df).sum()
+        missings = missings[missings > 0]

-        ignore_cols = FIXED_FEATURES + FIXED_DESCRIPTON_MAPPED_FEATURES + key_columns
+        if ignore_cols:
+            missings = missings[~missings.index.isin(ignore_cols)]

-        columns = {x for x in df.columns if x not in ignore_cols}
-
-        non_numerical_columns = df.select_dtypes(exclude=['number']).columns.tolist()
-        non_numerical_columns = [col for col in non_numerical_columns if col in columns]
-        levels = {col: df[col].unique().tolist() for col in non_numerical_columns}
-
-        df = pd.get_dummies(df, columns=non_numerical_columns)
-
-        # We make sure there is a starting and ending version of the column
-        diff_columns = []
-        no_diff_columns = []  # Store for debugging
-        for col in columns:
-            if "_ENDING" in col:
-                # Don't keep the endings
-                continue
+        for col in missings.index:
+            unique_values = df[col].unique()
+            if True in unique_values or False in unique_values:
+                df[col] = df[col].fillna(False)
+            if "none" in unique_values:
+                df[col] = df[col].fillna("none")
            else:
-                # We have a starting column so check if we have an ending
-                if col.replace("_STARTING", "") + "_ENDING" in columns:
-                    diff_columns.append(col)
-                else:
-                    no_diff_columns.append(col)
-
-        if any(c not in FIXED_DESCRIPTON_MAPPED_FEATURES for c in no_diff_columns):
-            raise Exception("Something went wrong, potentially missed a differencing column")
-
-        datatypes = df.dtypes
-
-        # Note: We also difference columns like floor area and floor height. We should experiement with this.
-        # Starting floor area will heavily impact the starting sap value so that feature may be encapsulated by
-        # the starting value, therefore to explain any differences in the new floor area, it may be enough to
-        # just consider the difference however we can play around with this.
-
-        # Do the differencing
-        cols_to_append = {}
-        for starting_col in diff_columns:
-
-            base_col = starting_col.replace("_STARTING", "")
-
-            if "_STARTING" in starting_col:
-                ending_col = starting_col.replace("_STARTING", "_ENDING")
-            else:
-                ending_col = starting_col + "_ENDING"
-
-            if starting_col not in non_numerical_columns:
-                cols_to_append[f"{base_col}_DIFF"] = df[ending_col] - df[starting_col]
-                df = df.drop(columns=[starting_col, ending_col])
-                continue
-
-            level_values = list(set(levels[starting_col] + levels[ending_col]))
-
-            level_cols = []
-            for level in level_values:
-                starting_level_col = "_".join([starting_col, str(level)])
-                ending_level_col = "_".join([ending_col, str(level)])
-
-                if starting_level_col not in df.columns:
-                    # We have no starting, just ending
-                    col_type = datatypes[ending_level_col].name
-
-                    if col_type == "bool":
-                        cols_to_append[f"{base_col}_{level}_DIFF"] = df[ending_level_col].astype(int)
-                    else:
-                        cols_to_append[f"{base_col}_{level}_DIFF"] = df[ending_level_col]
-
-                    level_cols.append(ending_level_col)
-
-                elif ending_level_col not in df.columns:
-                    # We have no ending, just starting
-                    col_type = datatypes[starting_level_col].name
-
-                    if col_type == "bool":
-                        cols_to_append[f"{base_col}_{level}_DIFF"] = -1 * df[starting_level_col].astype(int)
-                    else:
-                        cols_to_append[f"{base_col}_{level}_DIFF"] = -1 * df[ending_level_col]
-
-                    level_cols.append(starting_level_col)
-
-                else:
-                    col_type = datatypes[starting_level_col].name
-
-                    if col_type == "bool":
-                        cols_to_append[f"{base_col}_{level}_DIFF"] = (
-                            df[ending_level_col].astype(int) - df[starting_level_col].astype(int)
-                        )
-                    else:
-                        cols_to_append[f"{base_col}_{level}_DIFF"] = df[ending_level_col] - df[starting_level_col]
-
-                    level_cols.extend([starting_level_col, ending_level_col])
-
-            # Drop the columns
-            df = df.drop(columns=level_cols)
-
-        cols_to_append = pd.DataFrame(cols_to_append)
-        df = pd.concat([df, cols_to_append], axis=1)
-
-        # Perform a final coercing of string True/False columns to boolean
-        df = cls.coerce_boolean_columns(df, cols_to_ignore=key_columns)
+                df[col] = df[col].fillna("Unknown")

        return df
--- a/model_data/simulation_system/core/FeatureProcessor.py
+++ b/model_data/simulation_system/core/FeatureProcessor.py
--- a/model_data/analysis/init.py
+++ b/model_data/analysis/init.py
--- a/model_data/simulation_system/generate_rdsap_change.py
+++ b/model_data/simulation_system/generate_rdsap_change.py
@ -4,25 +4,24 @@ from tqdm import tqdm
 import msgpack

 from pathlib import Path
-from model_data.simulation_system.core.Settings import (
+from etl.epc.settings import (
    MANDATORY_FIXED_FEATURES,
    LATEST_FIELD,
    COMPONENT_FEATURES,
    RDSAP_RESPONSE,
    HEAT_DEMAND_RESPONSE,
    COLUMNS_TO_MERGE_ON,
-    EARLIEST_EPC_DATE,
    CARBON_RESPONSE,
 )
-from model_data.simulation_system.core.DataProcessor import DataProcessor
-from utils.s3 import save_dataframe_to_s3_parquet, read_from_s3, read_dataframe_from_s3_parquet
+from etl.epc.DataProcessor import DataProcessor
+from utils.s3 import save_dataframe_to_s3_parquet, read_from_s3
 from recommendations.rdsap_tables import england_wales_age_band_lookup
 from recommendations.recommendation_utils import (
    get_wall_u_value, get_roof_u_value, get_floor_u_value, estimate_perimeter,
    get_wall_type
 )

-DATA_DIRECTORY = Path(__file__).parent / "model_data" / "simulation_system" / "data" / "all-domestic-certificates"
+DATA_DIRECTORY = Path(__file__).parent / "local_data" / "all-domestic-certificates"


 def get_cleaned():
@ -364,21 +363,6 @@ def make_uvalues(df):
    return df


-def clean_missings_after_description_process(df):
-    missings = pd.isnull(df).sum()
-    missings = missings[missings > 0]
-    for col in missings.index:
-        unique_values = df[col].unique()
-        if True in unique_values or False in unique_values:
-            df[col] = df[col].fillna(False)
-        if "none" in unique_values:
-            df[col] = df[col].fillna("none")
-        else:
-            df[col] = df[col].fillna("Unknown")
-
-    return df
-
-
 def app():
    # Get all the files in the directory

@ -400,6 +384,8 @@ def app():
        data_processor = DataProcessor(filepath=filepath)

        df = data_processor.pre_process()
+        df[df["WALLS_DESCRIPTION"].str.contains("Cavity")]["WALLS_DESCRIPTION"].unique()
+
        cleaning_averages = data_processor.make_cleaning_averages()

        # We have some odd cases with missing constituency so we fill
@ -512,12 +498,11 @@ def app():

        # Add some temporal features - we look at the days from the standard starting point in time
        # for the starting and ending date so all records are from a fixed point
-        data_by_urpn_df["DAYS_TO_STARTING"] = (
-            pd.to_datetime(data_by_urpn_df["LODGEMENT_DATE_STARTING"]) - pd.to_datetime(EARLIEST_EPC_DATE)
-        ).dt.days
-        data_by_urpn_df["DAYS_TO_ENDING"] = (
-            pd.to_datetime(data_by_urpn_df["LODGEMENT_DATE_ENDING"]) - pd.to_datetime(EARLIEST_EPC_DATE)
-        ).dt.days
+        data_by_urpn_df["DAYS_TO_STARTING"] = DataProcessor.calculate_days_to(
+            data_by_urpn_df["LODGEMENT_DATE_STARTING"])
+
+        data_by_urpn_df["DAYS_TO_ENDING"] = DataProcessor.calculate_days_to(
+            data_by_urpn_df["LODGEMENT_DATE_ENDING"])

        data_by_urpn_df = data_by_urpn_df.drop(columns=["LODGEMENT_DATE_STARTING", "LODGEMENT_DATE_ENDING"])

@ -544,7 +529,7 @@ def app():
        #       Those nulls should be False. clean_missings_after_description_process handles this but shouldn't
        #       need to

-        data_by_urpn_df = clean_missings_after_description_process(data_by_urpn_df)
+        data_by_urpn_df = DataProcessor.clean_missings_after_description_process(data_by_urpn_df)

        if pd.isnull(data_by_urpn_df).sum().sum():
            raise ValueError("Null values found in dataset after process_and_prune_desriptions")
@ -564,6 +549,12 @@ def app():

    output = pd.concat(dataset)

+    # Remove any records that have huge swings in their floor area
+    output["tfa_diff_abs"] = abs(output["TOTAL_FLOOR_AREA_ENDING"] - output["TOTAL_FLOOR_AREA_STARTING"])
+    output["tfa_diff_prop"] = output["tfa_diff_abs"] / output["TOTAL_FLOOR_AREA_STARTING"]
+    output = output[output["tfa_diff_prop"] < 0.5]
+    output = output.drop(columns=["tfa_diff_abs", "tfa_diff_prop"])
+
    uvalue_columns = [col for col in output.columns if "thermal_transmittance" in col]
    for uvalue_col in uvalue_columns:
        output[uvalue_col] = pd.to_numeric(output[uvalue_col])
@ -571,15 +562,7 @@ def app():
    save_dataframe_to_s3_parquet(
        df=output,
        bucket_name="retrofit-data-dev",
-        file_key="sap_change_model/dataset_without_differencing.parquet",
-    )
-
-    output = DataProcessor.difference_data(output)
-
-    save_dataframe_to_s3_parquet(
-        df=output,
-        bucket_name="retrofit-data-dev",
-        file_key="sap_change_model/dataset_with_differencing.parquet",
+        file_key="sap_change_model/dataset.parquet",
    )


--- a/etl/epc/requirements.txt
+++ b/etl/epc/requirements.txt
--- a/model_data/simulation_system/core/Settings.py
+++ b/model_data/simulation_system/core/Settings.py
@ -133,28 +133,6 @@ RDSAP_RESPONSE = "CURRENT_ENERGY_EFFICIENCY"
 HEAT_DEMAND_RESPONSE = "ENERGY_CONSUMPTION_CURRENT"
 CARBON_RESPONSE = "CO2_EMISSIONS_CURRENT"

-
-def ordinal(n):
-    if 10 <= n % 100 <= 20:
-        suffix = "th"
-    else:
-        suffix = {1: "st", 2: "nd", 3: "rd"}.get(n % 10, "th")
-
-    return str(n) + suffix
-
-
-FLOOR_LEVEL_MAP = {
-    "Basement": -1,
-    "Ground": 0,
-    "ground floor": 0,
-    "20+": 20,
-    "21st or above": 21,
-    **{str(i).zfill(2): i for i in range(0, 21)},
-    **{ordinal(i): i for i in range(-1, 21)},
-    **{str(i): i for i in range(-1, 21)},
-    **{i: i for i in range(-1, 21)},
-}
-
 BUILT_FORM_REMAP = {
    "Enclosed End-Terrace": "End-Terrace",
    "Enclosed Mid-Terrace": "Mid-Terrace",
@ -212,10 +190,66 @@ fill_na_map = {
    "NUMBER_OPEN_FIREPLACES": 0
 }

-# After the property descriptions have been re-remapped, we expect these features to be fixed
-FIXED_DESCRIPTON_MAPPED_FEATURES = [
-    'another_property_below', 'is_roof_room', 'is_granite_or_whinstone', 'is_flat', 'is_suspended',
-    'has_dwelling_above', 'is_as_built', 'is_to_external_air', 'is_cob', 'is_pitched', 'is_solid', 'is_at_rafters',
-    'is_solid_brick', 'is_loft', 'is_system_built', 'is_timber_frame', 'is_sandstone_or_limestone', 'is_filled_cavity',
-    'is_cavity_wall', 'is_thatched', 'is_to_unheated_space'
+################################################################################################
+# These are the features we need for scoring
+# We'll likely change how we do this in the future
+################################################################################################
+
+STARTING_SUFFIX_COMPONENT_COLS = [
+    "SAP", "HEAT_DEMAND", "CARBON", "TRANSACTION_TYPE", "MECHANICAL_VENTILATION",
+    "SECONDHEAT_DESCRIPTION", "ENERGY_TARIFF", "SOLAR_WATER_HEATING_FLAG", "PHOTO_SUPPLY",
+    "GLAZED_TYPE", "MULTI_GLAZE_PROPORTION", "LOW_ENERGY_LIGHTING", "NUMBER_OPEN_FIREPLACES",
+    "EXTENSION_COUNT", "TOTAL_FLOOR_AREA", "FLOOR_HEIGHT", "DAYS_TO", "estimated_perimeter"
+]
+NO_SUFFIX_COMPONENT_COLS = ['walls_thermal_transmittance', 'is_cavity_wall',
+                            'is_filled_cavity', 'is_solid_brick', 'is_system_built', 'is_timber_frame',
+                            'is_granite_or_whinstone', 'is_as_built', 'is_cob', 'is_sandstone_or_limestone',
+                            'is_park_home', 'walls_insulation_thickness', 'external_insulation', 'internal_insulation',
+                            'floor_thermal_transmittance', 'is_to_unheated_space', 'is_to_external_air', 'is_suspended',
+                            'is_solid', 'another_property_below', 'floor_insulation_thickness',
+                            'roof_thermal_transmittance', 'is_pitched', 'is_roof_room', 'is_loft', 'is_flat',
+                            'is_thatched', 'is_at_rafters', 'has_dwelling_above', 'roof_insulation_thickness',
+                            'heater_type', 'system_type', 'thermostat_characteristics', 'heating_scope',
+                            'energy_recovery',
+                            'hotwater_tariff_type', 'extra_features', 'chp_systems', 'distribution_system',
+                            'no_system_present', 'appliance', 'has_radiators', 'has_fan_coil_units',
+                            'has_pipes_in_screed_above_insulation', 'has_pipes_in_insulated_timber_floor',
+                            'has_pipes_in_concrete_slab', 'has_boiler', 'has_air_source_heat_pump', 'has_room_heaters',
+                            'has_electric_storage_heaters', 'has_warm_air', 'has_electric_underfloor_heating',
+                            'has_electric_ceiling_heating', 'has_community_scheme', 'has_ground_source_heat_pump',
+                            'has_no_system_present', 'has_portable_electric_heaters', 'has_water_source_heat_pump',
+                            'has_electric_heat_pump', 'has_micro-cogeneration', 'has_solar_assisted_heat_pump',
+                            'has_exhaust_source_heat_pump', 'has_community_heat_pump', 'has_electric', 'has_mains_gas',
+                            'has_wood_logs', 'has_coal', 'has_oil', 'has_wood_pellets', 'has_anthracite',
+                            'has_dual_fuel_mineral_and_wood', 'has_smokeless_fuel', 'has_lpg', 'has_b30k',
+                            'has_electricaire', 'has_assumed_for_most_rooms', 'has_underfloor_heating',
+                            'thermostatic_control', 'charging_system', 'switch_system', 'no_control', 'dhw_control',
+                            'community_heating', 'multiple_room_thermostats', 'auxiliary_systems', 'trvs',
+                            'rate_control',
+                            'glazing_type', 'fuel_type', 'main-fuel_tariff_type', 'is_community',
+                            'no_individual_heating_or_community_network', 'complex_fuel_type',
+                            ]
+
+ENDING_SUFFIX_COMPONENT_COLS = [
+    'SAP', 'HEAT_DEMAND', 'CARBON', 'TRANSACTION_TYPE', 'MECHANICAL_VENTILATION', 'SECONDHEAT_DESCRIPTION',
+    'ENERGY_TARIFF', 'SOLAR_WATER_HEATING_FLAG', 'PHOTO_SUPPLY', 'GLAZED_TYPE', 'MULTI_GLAZE_PROPORTION',
+    'LOW_ENERGY_LIGHTING', 'NUMBER_OPEN_FIREPLACES', 'EXTENSION_COUNT', 'TOTAL_FLOOR_AREA', 'FLOOR_HEIGHT',
+    'DAYS_TO', 'walls_thermal_transmittance', 'is_park_home', 'walls_insulation_thickness',
+    'external_insulation', 'internal_insulation', 'floor_thermal_transmittance', 'floor_insulation_thickness',
+    'roof_thermal_transmittance', 'roof_insulation_thickness', 'heater_type', 'system_type',
+    'thermostat_characteristics', 'heating_scope', 'energy_recovery', 'hotwater_tariff_type', 'extra_features',
+    'chp_systems', 'distribution_system', 'no_system_present', 'appliance', 'has_radiators',
+    'has_fan_coil_units', 'has_pipes_in_screed_above_insulation', 'has_pipes_in_insulated_timber_floor',
+    'has_pipes_in_concrete_slab', 'has_boiler', 'has_air_source_heat_pump', 'has_room_heaters',
+    'has_electric_storage_heaters', 'has_warm_air', 'has_electric_underfloor_heating',
+    'has_electric_ceiling_heating', 'has_community_scheme', 'has_ground_source_heat_pump',
+    'has_no_system_present', 'has_portable_electric_heaters', 'has_water_source_heat_pump',
+    'has_electric_heat_pump', 'has_micro-cogeneration', 'has_solar_assisted_heat_pump',
+    'has_exhaust_source_heat_pump', 'has_community_heat_pump', 'has_electric', 'has_mains_gas', 'has_wood_logs',
+    'has_coal', 'has_oil', 'has_wood_pellets', 'has_anthracite', 'has_dual_fuel_mineral_and_wood',
+    'has_smokeless_fuel', 'has_lpg', 'has_b30k', 'has_electricaire', 'has_assumed_for_most_rooms',
+    'has_underfloor_heating', 'thermostatic_control', 'charging_system', 'switch_system', 'no_control',
+    'dhw_control', 'community_heating', 'multiple_room_thermostats', 'auxiliary_systems', 'trvs',
+    'rate_control', 'glazing_type', 'fuel_type', 'main-fuel_tariff_type', 'is_community',
+    'no_individual_heating_or_community_network', 'complex_fuel_type', 'estimated_perimeter'
 ]
--- a/etl/epc_clean/EpcClean.py
+++ b/etl/epc_clean/EpcClean.py
@ -4,16 +4,16 @@ from collections import defaultdict

 import pandas as pd

-from model_data.utils import correct_spelling
-from model_data.epc_attributes.FloorAttributes import FloorAttributes
-from model_data.epc_attributes.HotWaterAttributes import HotWaterAttributes
-from model_data.epc_attributes.MainFuelAttributes import MainFuelAttributes
-from model_data.epc_attributes.MainheatAttributes import MainHeatAttributes
-from model_data.epc_attributes.MainheatControlAttributes import MainheatControlAttributes
-from model_data.epc_attributes.RoofAttributes import RoofAttributes
-from model_data.epc_attributes.WallAttributes import WallAttributes
-from model_data.epc_attributes.WindowAttributes import WindowAttributes
-from model_data.epc_attributes.LightingAttributes import LightingAttributes
+from etl.epc_clean.utils import correct_spelling
+from etl.epc_clean.epc_attributes.FloorAttributes import FloorAttributes
+from etl.epc_clean.epc_attributes.HotWaterAttributes import HotWaterAttributes
+from etl.epc_clean.epc_attributes.MainFuelAttributes import MainFuelAttributes
+from etl.epc_clean.epc_attributes.MainheatAttributes import MainHeatAttributes
+from etl.epc_clean.epc_attributes.MainheatControlAttributes import MainheatControlAttributes
+from etl.epc_clean.epc_attributes.RoofAttributes import RoofAttributes
+from etl.epc_clean.epc_attributes.WallAttributes import WallAttributes
+from etl.epc_clean.epc_attributes.WindowAttributes import WindowAttributes
+from etl.epc_clean.epc_attributes.LightingAttributes import LightingAttributes


 class EpcClean:
@ -130,7 +130,7 @@ class EpcClean:
            self.cleaned[field].append(
                {
                    "original_description": description,
-                    "clean_description": cln.description.capitalize(),
+                    "clean_description": cln.description.replace("(assumed)", "").rstrip().capitalize(),
                    **cln.process()
                }
            )
--- a/model_data/plotting/init.py
+++ b/model_data/plotting/init.py
--- a/model_data/cleaner_app.py
+++ b/model_data/cleaner_app.py
@ -3,8 +3,8 @@ import os
 import pandas as pd
 import msgpack

-from model_data.EpcClean import EpcClean
-from model_data.simulation_system.core.Settings import EARLIEST_EPC_DATE
+from etl.epc_clean.EpcClean import EpcClean
+from etl.epc.settings import EARLIEST_EPC_DATE
 from pathlib import Path
 from utils.s3 import save_data_to_s3

@ -19,7 +19,7 @@ LAND_REGISTRY_PATHS = [
    os.path.abspath(os.path.dirname(__file__)) + "/model_data/local_data/pp-2017-part2.csv",
 ]

-EPC_DIRECTORY = Path(__file__).parent / "model_data" / "simulation_system" / "data" / "all-domestic-certificates"
+EPC_DIRECTORY = Path(__file__).parent / "local_data" / "all-domestic-certificates"

 ENVIRONMENT = os.getenv("ENVIRONMENT", "dev")

@ -27,7 +27,7 @@ ENVIRONMENT = os.getenv("ENVIRONMENT", "dev")
 def app():
    """
    For a pre-defined list of constituencies and property data_types, we'll download EPC data from the API
-    and produce a dataset of cleaned fields so that when we get new properties, we can quickly
+    and produce a dataset of cleaned fields so that when we get new EPCs, we can quickly
    sanitise any description data

    Currently, this application is just run on a local machine
@ -36,9 +36,6 @@ def app():
    cleaned_data = {}
    epc_directories = [entry for entry in EPC_DIRECTORY.iterdir() if entry.is_dir()]
    for directory in tqdm(epc_directories):
-        directory_destructured = str(directory).split("/")[-1].split("-")
-        gss_code = directory_destructured[1]
-        local_authority = directory_destructured[2]

        data = pd.read_csv(directory / "certificates.csv", low_memory=False)
        # Rename the columns to the same format as the api returns
@ -62,14 +59,6 @@ def app():
                new_data = [x for x in data if x["original_description"] not in existing_descriptions]
                cleaned_data[k].extend(new_data)

-        # TODO: Add property age band into this
-        # uvalue_estimates = UvalueEstimations(data=data)
-        # uvalue_estimates.get_estimates(cleaner=cleaner)
-        # # TODO: Store these to a s3
-        # uvalue_estimates.walls
-        # uvalue_estimates.floors
-        # uvalue_estimates.roofs
-
    # Basic check to make sure all descriptions are unique
    for _, cleaned in cleaned_data.items():
        descriptions = [x["original_description"] for x in cleaned]
--- a/etl/epc_clean/epc_attributes/FloorAttributes.py
+++ b/etl/epc_clean/epc_attributes/FloorAttributes.py
@ -1,7 +1,7 @@
 import re
 from typing import Dict, Union
-from model_data.BaseUtility import Definitions
-from model_data.epc_attributes.attribute_utils import extract_thermal_transmittance, extract_component_types
+from BaseUtility import Definitions
+from etl.epc_clean.epc_attributes.attribute_utils import extract_thermal_transmittance, extract_component_types


 class FloorAttributes(Definitions):
--- a/etl/epc_clean/epc_attributes/HotWaterAttributes.py
+++ b/etl/epc_clean/epc_attributes/HotWaterAttributes.py
@ -1,6 +1,6 @@
 from typing import Dict, Union
-from model_data.BaseUtility import Definitions
-from model_data.epc_attributes.attribute_utils import clean_description, find_keyword
+from BaseUtility import Definitions
+from etl.epc_clean.epc_attributes.attribute_utils import clean_description, find_keyword


 class HotWaterAttributes(Definitions):
--- a/etl/epc_clean/epc_attributes/LightingAttributes.py
+++ b/etl/epc_clean/epc_attributes/LightingAttributes.py
@ -1,6 +1,6 @@
 import re
-from model_data.epc_attributes.attribute_utils import clean_description
-from model_data.utils import correct_spelling
+from etl.epc_clean.epc_attributes.attribute_utils import clean_description
+from etl.epc_clean.utils import correct_spelling


 class LightingAttributes:
@ -27,7 +27,7 @@ class LightingAttributes:
        lel_match2 = re.search(r"goleuadau ynni-isel mewn (\d+)%? o'r mannau gosod", self.description)

        if lel_match is not None or lel_match2 is not None:
-            
+
            # Perform the actual translation
            percentage = lel_match.group(1) if lel_match is not None else lel_match2.group(1)
            self.description = f"low energy lighting in {percentage}% of fixed outlets"
--- a/etl/epc_clean/epc_attributes/MainFuelAttributes.py
+++ b/etl/epc_clean/epc_attributes/MainFuelAttributes.py
@ -1,6 +1,6 @@
 from typing import Dict, Union
-from model_data.BaseUtility import Definitions
-from model_data.epc_attributes.attribute_utils import clean_description, remove_punctuation, find_keyword
+from BaseUtility import Definitions
+from etl.epc_clean.epc_attributes.attribute_utils import clean_description, remove_punctuation, find_keyword


 class MainFuelAttributes(Definitions):
--- a/etl/epc_clean/epc_attributes/MainheatAttributes.py
+++ b/etl/epc_clean/epc_attributes/MainheatAttributes.py
@ -1,5 +1,5 @@
-from model_data.BaseUtility import Definitions
-from model_data.epc_attributes.attribute_utils import clean_description, process_part, switch_chars
+from BaseUtility import Definitions
+from etl.epc_clean.epc_attributes.attribute_utils import clean_description, process_part, switch_chars
 from typing import Dict, Union


--- a/etl/epc_clean/epc_attributes/MainheatControlAttributes.py
+++ b/etl/epc_clean/epc_attributes/MainheatControlAttributes.py
@ -1,6 +1,6 @@
 from typing import Dict, Union
-from model_data.BaseUtility import Definitions
-from model_data.epc_attributes.attribute_utils import clean_description, find_keyword
+from BaseUtility import Definitions
+from etl.epc_clean.epc_attributes.attribute_utils import clean_description, find_keyword


 class MainheatControlAttributes(Definitions):
--- a/etl/epc_clean/epc_attributes/RoofAttributes.py
+++ b/etl/epc_clean/epc_attributes/RoofAttributes.py
@ -1,7 +1,7 @@
 import re
 from typing import Dict, Union
-from model_data.BaseUtility import Definitions
-from model_data.epc_attributes.attribute_utils import extract_component_types, extract_thermal_transmittance
+from BaseUtility import Definitions
+from etl.epc_clean.epc_attributes.attribute_utils import extract_component_types, extract_thermal_transmittance


 class RoofAttributes(Definitions):
--- a/etl/epc_clean/epc_attributes/WallAttributes.py
+++ b/etl/epc_clean/epc_attributes/WallAttributes.py
@ -1,7 +1,7 @@
 import re
 from typing import Dict, Union
-from model_data.BaseUtility import Definitions
-from model_data.epc_attributes.attribute_utils import (
+from BaseUtility import Definitions
+from etl.epc_clean.epc_attributes.attribute_utils import (
    extract_component_types,
    extract_thermal_transmittance
 )
--- a/etl/epc_clean/epc_attributes/WindowAttributes.py
+++ b/etl/epc_clean/epc_attributes/WindowAttributes.py
@ -1,6 +1,6 @@
 from typing import Dict, Union
-from model_data.BaseUtility import Definitions
-from model_data.epc_attributes.attribute_utils import clean_description
+from BaseUtility import Definitions
+from etl.epc_clean.epc_attributes.attribute_utils import clean_description


 class WindowAttributes(Definitions):
--- a/model_data/simulation_system/MLModel/init.py
+++ b/model_data/simulation_system/MLModel/init.py
--- a/etl/epc_clean/epc_attributes/all_cleaners.py
+++ b/etl/epc_clean/epc_attributes/all_cleaners.py
@ -0,0 +1,21 @@
+from etl.epc_clean.epc_attributes.FloorAttributes import FloorAttributes
+from etl.epc_clean.epc_attributes.HotWaterAttributes import HotWaterAttributes
+from etl.epc_clean.epc_attributes.MainFuelAttributes import MainFuelAttributes
+from etl.epc_clean.epc_attributes.MainheatAttributes import MainHeatAttributes
+from etl.epc_clean.epc_attributes.MainheatControlAttributes import MainheatControlAttributes
+from etl.epc_clean.epc_attributes.RoofAttributes import RoofAttributes
+from etl.epc_clean.epc_attributes.WallAttributes import WallAttributes
+from etl.epc_clean.epc_attributes.WindowAttributes import WindowAttributes
+from etl.epc_clean.epc_attributes.LightingAttributes import LightingAttributes
+
+# Maps each EPC description field name to the cleaner class responsible for parsing that field
+all_cleaner_map = {
+    'floor-description': FloorAttributes,
+    'hotwater-description': HotWaterAttributes,
+    'main-fuel': MainFuelAttributes,
+    'mainheat-description': MainHeatAttributes,
+    'mainheatcont-description': MainheatControlAttributes,
+    'roof-description': RoofAttributes,
+    'walls-description': WallAttributes,
+    'windows-description': WindowAttributes,
+    'lighting-description': LightingAttributes,
+}
--- a/etl/epc_clean/epc_attributes/attribute_utils.py
+++ b/etl/epc_clean/epc_attributes/attribute_utils.py
--- a/model_data/simulation_system/init.py
+++ b/model_data/simulation_system/init.py
--- a/etl/epc_clean/tests/test_attribute_utils.py
+++ b/etl/epc_clean/tests/test_attribute_utils.py
@ -1,5 +1,5 @@
 import pytest
-import model_data.epc_attributes.attribute_utils as attribute_utils
+import etl.epc_clean.epc_attributes.attribute_utils as attribute_utils


 def test_extract_thermal_transmittance():
--- a/etl/epc_clean/tests/test_data/EpcClean_inputs.obj
+++ b/etl/epc_clean/tests/test_data/EpcClean_inputs.obj
--- a/etl/epc_clean/tests/test_data/test_floor_attributes_cases.py
+++ b/etl/epc_clean/tests/test_data/test_floor_attributes_cases.py
--- a/etl/epc_clean/tests/test_data/test_hot_water_attributes_cases.py
+++ b/etl/epc_clean/tests/test_data/test_hot_water_attributes_cases.py
--- a/etl/epc_clean/tests/test_data/test_lighting_attributes_cases.py
+++ b/etl/epc_clean/tests/test_data/test_lighting_attributes_cases.py
--- a/etl/epc_clean/tests/test_data/test_main_fuel_attributes_cases.py
+++ b/etl/epc_clean/tests/test_data/test_main_fuel_attributes_cases.py
--- a/etl/epc_clean/tests/test_data/test_mainheat_attributes_cases.py
+++ b/etl/epc_clean/tests/test_data/test_mainheat_attributes_cases.py
--- a/etl/epc_clean/tests/test_data/test_mainheat_control_attributes_cases.py
+++ b/etl/epc_clean/tests/test_data/test_mainheat_control_attributes_cases.py
--- a/etl/epc_clean/tests/test_data/test_roof_attributes_cases.py
+++ b/etl/epc_clean/tests/test_data/test_roof_attributes_cases.py
--- a/etl/epc_clean/tests/test_data/test_wall_attributes_cases.py
+++ b/etl/epc_clean/tests/test_data/test_wall_attributes_cases.py
--- a/etl/epc_clean/tests/test_data/test_window_attributes_cases.py
+++ b/etl/epc_clean/tests/test_data/test_window_attributes_cases.py
--- a/etl/epc_clean/tests/test_epc_clean.py
+++ b/etl/epc_clean/tests/test_epc_clean.py
@ -1,6 +1,6 @@
 import pytest
 import pickle
-from model_data.EpcClean import EpcClean
+from etl.epc_clean.EpcClean import EpcClean
 from pathlib import Path

 # For local testing
--- a/etl/epc_clean/tests/test_floor_attributes.py
+++ b/etl/epc_clean/tests/test_floor_attributes.py
@ -1,6 +1,6 @@
 import pytest
-from model_data.tests.test_data.test_floor_attributes_cases import clean_floor_cases
-from model_data.epc_attributes.FloorAttributes import FloorAttributes
+from etl.epc_clean.tests.test_data.test_floor_attributes_cases import clean_floor_cases
+from etl.epc_clean.epc_attributes.FloorAttributes import FloorAttributes


 class TestCleanFloor:
--- a/etl/epc_clean/tests/test_hotwater_attributes.py
+++ b/etl/epc_clean/tests/test_hotwater_attributes.py
@ -1,6 +1,6 @@
 import pytest
-from model_data.epc_attributes.HotWaterAttributes import HotWaterAttributes
-from model_data.tests.test_data.test_hot_water_attributes_cases import hotwater_cases
+from etl.epc_clean.epc_attributes.HotWaterAttributes import HotWaterAttributes
+from etl.epc_clean.tests.test_data.test_hot_water_attributes_cases import hotwater_cases


 class TestHotWaterAttributes:
--- a/etl/epc_clean/tests/test_lighting_attributes.py
+++ b/etl/epc_clean/tests/test_lighting_attributes.py
@ -1,7 +1,7 @@
 import pandas as pd
 import pytest
-from model_data.tests.test_data.test_lighting_attributes_cases import test_cases
-from model_data.epc_attributes.LightingAttributes import LightingAttributes
+from etl.epc_clean.tests.test_data.test_lighting_attributes_cases import test_cases
+from etl.epc_clean.epc_attributes.LightingAttributes import LightingAttributes

 # An example averages dataset to use in tests. It is a dictionary where the key is a lighting description and the
 # value is the expected proportion.
--- a/etl/epc_clean/tests/test_mainfuel_attributes.py
+++ b/etl/epc_clean/tests/test_mainfuel_attributes.py
@ -1,6 +1,6 @@
 import pytest
-from model_data.epc_attributes.MainFuelAttributes import MainFuelAttributes
-from model_data.tests.test_data.test_main_fuel_attributes_cases import mainfuel_cases
+from etl.epc_clean.epc_attributes.MainFuelAttributes import MainFuelAttributes
+from etl.epc_clean.tests.test_data.test_main_fuel_attributes_cases import mainfuel_cases


 class TestMainHeatControlAttributes:
--- a/etl/epc_clean/tests/test_mainheat_attributes.py
+++ b/etl/epc_clean/tests/test_mainheat_attributes.py
@ -1,6 +1,6 @@
 import pytest
-from model_data.epc_attributes.MainheatAttributes import MainHeatAttributes
-from model_data.tests.test_data.test_mainheat_attributes_cases import mainheat_cases
+from etl.epc_clean.epc_attributes.MainheatAttributes import MainHeatAttributes
+from etl.epc_clean.tests.test_data.test_mainheat_attributes_cases import mainheat_cases


 class TestMainHeatAttributes:
--- a/etl/epc_clean/tests/test_mainheat_controls_attributes.py
+++ b/etl/epc_clean/tests/test_mainheat_controls_attributes.py
@ -1,6 +1,6 @@
 import pytest
-from model_data.epc_attributes.MainheatControlAttributes import MainheatControlAttributes
-from model_data.tests.test_data.test_mainheat_control_attributes_cases import mainheat_control_cases
+from etl.epc_clean.epc_attributes.MainheatControlAttributes import MainheatControlAttributes
+from etl.epc_clean.tests.test_data.test_mainheat_control_attributes_cases import mainheat_control_cases


 class TestMainHeatControlAttributes:
--- a/etl/epc_clean/tests/test_roof_attributes.py
+++ b/etl/epc_clean/tests/test_roof_attributes.py
@ -1,7 +1,7 @@
 import pytest
 from pathlib import Path
-from model_data.tests.test_data.test_roof_attributes_cases import clean_roof_test_cases
-from model_data.epc_attributes.RoofAttributes import RoofAttributes
+from etl.epc_clean.tests.test_data.test_roof_attributes_cases import clean_roof_test_cases
+from etl.epc_clean.epc_attributes.RoofAttributes import RoofAttributes

 # For local testing
 if __file__ == "<input>":
--- a/etl/epc_clean/tests/test_utils.py
+++ b/etl/epc_clean/tests/test_utils.py
@ -1,4 +1,4 @@
-from model_data.utils import is_percentage_or_number, correct_spelling
+from etl.epc_clean.utils import is_percentage_or_number, correct_spelling


 class TestUtils:
--- a/etl/epc_clean/tests/test_wall_attributes.py
+++ b/etl/epc_clean/tests/test_wall_attributes.py
@ -1,6 +1,6 @@
 import pytest
-from model_data.epc_attributes.WallAttributes import WallAttributes
-from model_data.tests.test_data.test_wall_attributes_cases import wall_cases
+from etl.epc_clean.epc_attributes.WallAttributes import WallAttributes
+from etl.epc_clean.tests.test_data.test_wall_attributes_cases import wall_cases


 class TestWallAttributes:
--- a/etl/epc_clean/tests/test_window_attributes.py
+++ b/etl/epc_clean/tests/test_window_attributes.py
@ -1,6 +1,6 @@
 import pytest
-from model_data.epc_attributes.WindowAttributes import WindowAttributes
-from model_data.tests.test_data.test_window_attributes_cases import windows_cases
+from etl.epc_clean.epc_attributes.WindowAttributes import WindowAttributes
+from etl.epc_clean.tests.test_data.test_window_attributes_cases import windows_cases


 class TestWindowAttributes:
--- a/etl/epc_clean/utils.py
+++ b/etl/epc_clean/utils.py
--- a/etl/land_registry/LandRegistryClient.py
+++ b/etl/land_registry/LandRegistryClient.py
--- a/model_data/simulation_system/core/init.py
+++ b/model_data/simulation_system/core/init.py
--- a/etl/land_registry/app.py
+++ b/etl/land_registry/app.py
--- a/etl/land_registry/sample_addresses.pkl
+++ b/etl/land_registry/sample_addresses.pkl
--- a/etl/land_registry/tests/test_land_registry_client.py
+++ b/etl/land_registry/tests/test_land_registry_client.py
@ -1,6 +1,6 @@
 import pandas as pd
 from unittest.mock import patch, call
-from model_data.LandRegistryClient import LandRegistryClient
+from etl.land_registry.LandRegistryClient import LandRegistryClient


 class TestLandRegistryClient:
--- a/etl/property_dimensions/init.py
+++ b/etl/property_dimensions/init.py
--- a/etl/property_dimensions/app.py
+++ b/etl/property_dimensions/app.py
@ -0,0 +1,54 @@
+"""
+This is a simple application which estimates some of the basic dimensions of a property based on EPC
+data which we can use as a proxy value if we don't have this information on the EPC
+"""
+import os
+from pathlib import Path
+import pandas as pd
+from tqdm import tqdm
+from etl.epc.settings import EARLIEST_EPC_DATE
+from etl.epc.DataProcessor import DataProcessor
+from BaseUtility import Definitions
+from utils.s3 import save_dataframe_to_s3_parquet
+
+DATA_DIRECTORY = Path(__file__).parent / "local_data" / "all-domestic-certificates"
+
+GROUPBY = ["PROPERTY_TYPE", "BUILT_FORM", "CONSTITUENCY", "CONSTRUCTION_AGE_BAND"]
+
+BUCKET = os.environ.get("BUCKET", "retrofit-data-dev")
+
+
+def app():
+    """
+    Estimate typical property dimensions from bulk EPC certificate data.
+
+    For every directory under DATA_DIRECTORY, read its certificates.csv, drop certificates lodged before
+    EARLIEST_EPC_DATE and any record missing a UPRN, a usable construction age band, the floor area, the
+    number of habitable rooms or the floor height. The remaining records are grouped by GROUPBY to get
+    the median number of habitable rooms and the mean floor area and floor height, and the result is
+    saved to s3 as property_dimensions/<local authority>.parquet.
+
+    Directories with no usable records after filtering are reported and skipped.
+
+    :raises ValueError: if a single certificates file contains more than one local authority
+    """
+    directories = [entry for entry in DATA_DIRECTORY.iterdir() if entry.is_dir()]
+
+    for directory in tqdm(directories):
+        data = pd.read_csv(directory / "certificates.csv", low_memory=False)
+        data = data[data["LODGEMENT_DATE"] >= EARLIEST_EPC_DATE]
+        data = data[~pd.isnull(data["UPRN"])]
+        data["TOTAL_FLOOR_AREA"] = data["TOTAL_FLOOR_AREA"].astype(float)
+
+        data["CONSTRUCTION_AGE_BAND"] = data["CONSTRUCTION_AGE_BAND"].apply(
+            DataProcessor.clean_construction_age_band
+        )
+        data = data[~pd.isnull(data["CONSTRUCTION_AGE_BAND"])]
+        data = data[~data["CONSTRUCTION_AGE_BAND"].isin(Definitions.DATA_ANOMALY_MATCHES)]
+        data = data[~pd.isnull(data["TOTAL_FLOOR_AREA"])]
+        data = data[~pd.isnull(data["NUMBER_HABITABLE_ROOMS"])]
+        data = data[~pd.isnull(data["FLOOR_HEIGHT"])]
+
+        if data.empty:
+            # Nothing usable survived the filters. Without this guard the local authority lookup
+            # below fails with an IndexError and aborts the run for every remaining directory.
+            # tqdm.write prints without corrupting the progress bar
+            tqdm.write(f"No usable EPC records in {directory}, skipping")
+            continue
+
+        df = (
+            data.groupby(GROUPBY)
+            .agg({"NUMBER_HABITABLE_ROOMS": "median", "TOTAL_FLOOR_AREA": "mean", "FLOOR_HEIGHT": "mean"})
+            .reset_index()
+        )
+
+        # The output file is keyed by local authority, so a file spanning several would be ambiguous
+        local_authority = data["LOCAL_AUTHORITY"].unique()
+        if len(local_authority) > 1:
+            raise ValueError("More than one la in data")
+        local_authority = local_authority[0]
+
+        save_dataframe_to_s3_parquet(
+            df=df,
+            bucket_name=BUCKET,
+            file_key=f"property_dimensions/{local_authority}.parquet",
+        )
--- a/etl/spatial/BoreholeClient.py
+++ b/etl/spatial/BoreholeClient.py
@ -56,7 +56,7 @@ class BoreholeClient:

    # EXAMPLE
    # There are ~1.4 million entries in this dataset and so we firstly want to reduce the number of
-    # entries in here if possible before we produce any form of comparison between our properties, to infer
+    # entries in here if possible before we produce any form of comparison between our epc, to infer
    # the distance from the property to the nearest borehole

    # Let's take a sample
--- a/conservation_areas/ConservationAreaClient.py
+++ b/conservation_areas/ConservationAreaClient.py
@ -1,12 +1,55 @@
-from enum import Enum
+import boto3
+import os
+import tempfile
 import geopandas as gpd
+import numpy as np
+from enum import Enum
 from shapely.geometry import Point
 from utils.logger import setup_logger
+from utils.s3 import read_io_from_s3
 from datatypes.datatypes import OpenUprnCoordinateData

 logger = setup_logger()


+def read_shapefile_from_s3(bucket_name, s3_file_key):
+    """
+    Read a shapefile from S3 into a GeoDataFrame.
+
+    Every object stored in the same S3 folder as the shape file is downloaded into a temporary
+    directory, so that the files sitting alongside the .shp file are available when it is read.
+    The temporary directory is removed once the data has been loaded.
+
+    :param bucket_name: The name of the S3 bucket
+    :param s3_file_key: The file path of the shape file
+    :return: GeoDataFrame containing the shapefile data
+    :raises FileNotFoundError: if the S3 folder containing the shape file holds no objects
+    """
+
+    s3_folder_key, _, shape_file_key = s3_file_key.rpartition("/")
+    # Terminate the prefix with "/" so that sibling folders sharing the same leading characters
+    # (e.g. "shapes" and "shapes_old") are not downloaded as well
+    prefix = f"{s3_folder_key}/" if s3_folder_key else ""
+
+    # TemporaryDirectory creates the directory for us and removes it on exit
+    with tempfile.TemporaryDirectory() as tmpdirname:
+        s3_client = boto3.client('s3')
+        logger.info("Creating temporary directory at %s" % tmpdirname)
+
+        # A single list call returns at most 1000 keys, so page through the full listing
+        paginator = s3_client.get_paginator('list_objects_v2')
+        downloaded = 0
+        for page in paginator.paginate(Bucket=bucket_name, Prefix=prefix):
+            # "Contents" is absent from the response when nothing matches the prefix
+            for s3_object in page.get('Contents', []):
+                file_key = s3_object['Key']
+                file_name = os.path.basename(file_key)
+                if not file_name:
+                    # "Folder" placeholder keys end in "/" and have no file name to write to
+                    continue
+                local_file_path = os.path.join(tmpdirname, file_name)
+                with open(local_file_path, 'wb') as tmpfile:
+                    s3_client.download_fileobj(bucket_name, file_key, tmpfile)
+                downloaded += 1
+
+        if not downloaded:
+            raise FileNotFoundError(
+                "No objects found in s3://%s/%s for shapefile %s" % (bucket_name, prefix, shape_file_key)
+            )
+
+        # Read the shapefile from the temporary directory into a GeoDataFrame
+        shapefile_path = os.path.join(tmpdirname, shape_file_key)
+        gdf = gpd.read_file(shapefile_path)
+
+    return gdf
+
+
 class ConservationAreaClient:
    """
    Class to interact and manupulate convervation area data. The historic england data
@ -18,13 +61,14 @@ class ConservationAreaClient:
    """

    SOURCES = ["historic_england"]
-    IN_CONSERVATION_AREA = "in_conservation_area"
-    NOT_IN_CONSERVATION_AREA = "not_in_conservation_area"
-    UNKNOWN = "unknown"
+    IN_CONSERVATION_AREA = True
+    NOT_IN_CONSERVATION_AREA = False
+    UNKNOWN = None

-    def __init__(self, historic_england_path, gov_path):
+    def __init__(self, historic_england_path, gov_path, bucket):
        self.historic_england_path = historic_england_path
        self.gov_path = gov_path
+        self.bucket = bucket

        self.historic_england_data = None
        self.gov_data = None
@ -34,11 +78,21 @@ class ConservationAreaClient:
        Read the data
        """
        logger.info("Reading in historic england conservation area shapefile")
-        self.historic_england_data = gpd.read_file(self.historic_england_path)
+        self.historic_england_data = read_shapefile_from_s3(
+            bucket_name=self.bucket, s3_file_key=self.historic_england_path
+        )

        logger.info("Reading in Govenment conservation area geojson")
-        self.gov_data = gpd.read_file(self.gov_path)
+
+        self.gov_data = gpd.read_file(
+            read_io_from_s3(
+                bucket_name=self.bucket,
+                file_key=self.gov_path
+            )
+        )
        self.gov_data = self.gov_data.drop(columns=["dataset"])
+        # Convert the gov data to british national grid co-ordinates
+        self.gov_data = self.gov_data.to_crs("EPSG:27700")

    def is_in_conservation_area(self, coordinates: OpenUprnCoordinateData):

@ -71,6 +125,43 @@ class ConservationAreaClient:
            else:
                return ConservationAreaClient.UNKNOWN

+    def is_in_conservation_area_vectorised(self, uprn_gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
+        """
+        Determine the conservation area status of many UPRNs in one pass using spatial joins.
+
+        The UPRNs are first joined against the Historic England data, then every UPRN without a
+        positive Historic England match is joined against the government data.
+
+        :param uprn_gdf: GeoDataFrame containing a "UPRN" column; presumably point geometries in the
+            same co-ordinate system as the conservation area data - confirm with the caller.
+            NOTE: this frame is modified in place, a "conservation_status" column is added to it
+        :return: the same uprn_gdf, with "conservation_status" set to IN_CONSERVATION_AREA,
+            NOT_IN_CONSERVATION_AREA or UNKNOWN for each row
+        """
+
+        joined_gdf_he = gpd.sjoin(uprn_gdf, self.historic_england_data, how="left", predicate="within")
+
+        # Identify where we have definitive information (not "unknown")
+        in_conservation_he = ~joined_gdf_he.index_right.isna() & (
+            joined_gdf_he["NAME"] != "No data available for publication by HE"
+        )
+
+        uprn_in_conservation_he = joined_gdf_he[in_conservation_he]["UPRN"].unique()
+        # The right index will be missing when we don't have a match so the uprn is not in a conservation
+        # area
+        uprn_not_in_conservation_he = joined_gdf_he.loc[
+            ~joined_gdf_he["UPRN"].isin(uprn_in_conservation_he) & joined_gdf_he.index_right.isna(),
+            "UPRN"
+        ].unique()
+
+        # Check every UPRN without a positive Historic England match (both the "not in" and the
+        # unknown ones) against government data
+        unknown_uprns = uprn_gdf.loc[~uprn_gdf["UPRN"].isin(uprn_in_conservation_he)]["UPRN"]
+        unknown_gdf = uprn_gdf[uprn_gdf["UPRN"].isin(unknown_uprns)]
+
+        joined_gdf_gov = gpd.sjoin(unknown_gdf, self.gov_data, how="left", predicate="within")
+        uprn_in_conservation_gov = joined_gdf_gov.loc[~joined_gdf_gov.index_right.isna(), "UPRN"].unique()
+
+        # Start from UNKNOWN and overwrite. The order of the assignments matters: the government
+        # match is applied last so it overrides a Historic England "not in" result
+        uprn_gdf['conservation_status'] = self.UNKNOWN
+        uprn_gdf.loc[
+            uprn_gdf["UPRN"].isin(uprn_in_conservation_he), 'conservation_status'
+        ] = self.IN_CONSERVATION_AREA
+        uprn_gdf.loc[
+            uprn_gdf["UPRN"].isin(uprn_not_in_conservation_he), 'conservation_status'
+        ] = self.NOT_IN_CONSERVATION_AREA
+        uprn_gdf.loc[
+            uprn_gdf["UPRN"].isin(uprn_in_conservation_gov), 'conservation_status'
+        ] = self.IN_CONSERVATION_AREA
+
+        return uprn_gdf
+
    def is_in_conservation_area_historic_england(self, x_bng: float, y_bng: float) -> str:
        """
        Check if a property is in a conservation area
--- a/etl/spatial/OpenUprnClient.py
+++ b/etl/spatial/OpenUprnClient.py
@ -0,0 +1,118 @@
+import os
+from tqdm import tqdm
+import pandas as pd
+import geopandas as gpd
+from utils.logger import setup_logger
+from utils.s3 import read_io_from_s3, save_dataframe_to_s3_parquet
+
+logger = setup_logger()
+
+
+class OpenUprnClient:
+    """
+
+    This client reads in the Open UPRN data from s3 which can be downloaded from here:
+    https://osdatahub.os.uk/downloads/open/OpenUPRN
+
+    This dataset contains a lookup of UPRNs to coordinates.
+
+    Specs for this dataset can be found here:
+    https://www.ordnancesurvey.co.uk/documents/product-support/tech-spec/open-uprn-techspec-v1.pdf
+    """
+
+    def __init__(self, path, bucket, uprns=None):
+        """
+        :param path: location of the Open UPRN csv, used as the s3 file key by read() and as a
+            local file path by read_local()
+        :param bucket: name of the s3 bucket that read() loads the csv from
+        :param uprns: optional collection of UPRNs; when given, only these rows are kept on read
+        """
+        self.path = path
+        self.bucket = bucket
+        self.uprns = [int(x) for x in uprns] if uprns else None
+        self.data = None
+
+        # This will be stored in S3 and will be the complete list of filenames
+        # We'll then use this to determine which file the UPRN's data is contained in
+        self.filenames = None
+
+    def read(self):
+        """
+        Read the Open UPRN csv from s3 into self.data, keeping only self.uprns when it is set
+        :return:
+        """
+        logger.info("Reading in open uprn data")
+
+        df = pd.read_csv(
+            read_io_from_s3(
+                bucket_name=self.bucket,
+                file_key=self.path
+            )
+        )
+        if self.uprns:
+            df = df[df["UPRN"].isin(self.uprns)]
+
+        self.data = df
+
+    def read_local(self):
+        """
+        For local testing. Read the Open UPRN csv from the local file system into self.data,
+        keeping only self.uprns when it is set
+        :return:
+        """
+        logger.info("Reading in open uprn data")
+
+        df = pd.read_csv(self.path)
+        if self.uprns:
+            df = df[df["UPRN"].isin(self.uprns)]
+
+        self.data = df
+
+    def create_file_partitions(self, partition_size=50000):
+        """
+        Split the data into consecutive UPRN ranges of at most partition_size rows and record the
+        parquet filename each row belongs to.
+
+        Adds "partition" and "filename" columns to self.data and populates self.filenames with a
+        mapping of partition number to "<min uprn>_<max uprn>.parquet". self.data is left sorted
+        by UPRN with a fresh 0..n-1 index.
+
+        :param partition_size: maximum number of rows per partition
+        """
+        logger.info("Sorting data by UPRN ascending")
+        # The index must be rebuilt after sorting because the partitions are derived from it. The
+        # original index reflects the order of the source file (with gaps if the data was filtered
+        # by uprn), which would give unevenly sized partitions whose UPRN ranges can overlap
+        self.data = self.data.sort_values("UPRN", ascending=True).reset_index(drop=True)
+
+        logger.info("Creating partitions")
+        self.data['partition'] = self.data.index // partition_size
+
+        self.filenames = {}
+        for partition, group in tqdm(self.data.groupby('partition')):
+            min_uprn = group['UPRN'].min()
+            max_uprn = group['UPRN'].max()
+            self.filenames[partition] = f"{min_uprn}_{max_uprn}.parquet"
+
+        self.data['filename'] = self.data['partition'].map(self.filenames)
+
+    @staticmethod
+    def find_filename_for_uprn(uprn, filenames):
+        """
+        Find the partition file whose UPRN range contains the given uprn
+
+        :param uprn: the UPRN to look up
+        :param filenames: iterable of file names in the "<min uprn>_<max uprn>.parquet" format
+        :return: the first matching file name, or None if no range contains the uprn
+        """
+        for filename in filenames:
+            min_uprn, max_uprn = map(int, filename.replace(".parquet", "").split("_"))
+            if min_uprn <= uprn <= max_uprn:
+                return filename
+        return None
+
+    @staticmethod
+    def convert_bng_data_to_gpd(df):
+        """
+        Convert a dataframe of Open UPRN records into a GeoDataFrame of points
+
+        :param df: dataframe with X_COORDINATE and Y_COORDINATE columns
+        :return: GeoDataFrame with point geometry in EPSG:27700
+        """
+
+        gpd_data = gpd.GeoDataFrame(
+            df,
+            geometry=gpd.points_from_xy(df.X_COORDINATE, df.Y_COORDINATE),
+            crs="EPSG:27700"  # British National Grid
+        )
+
+        return gpd_data
+
+    def save_filenames_to_s3(self, bucket_name):
+        """
+        Save the filenames to s3, along with the lower and upper UPRN each file covers.
+        Requires create_file_partitions to have been run first so that self.filenames is populated
+        :param bucket_name:
+        :return:
+        """
+        # s3 keys always use "/" as the separator, regardless of the operating system
+        file_key = "spatial/filename_meta.parquet"
+
+        filenames = pd.DataFrame({"filenames": list(self.filenames.values())})
+        # regex=False so the "." in ".parquet" is matched literally rather than as a wildcard
+        filenames[['lower', 'upper']] = filenames['filenames'].str.replace(
+            '.parquet', '', regex=False
+        ).str.extract(r'(\d+)_(\d+)')
+        filenames['lower'] = filenames['lower'].astype(int)
+        filenames['upper'] = filenames['upper'].astype(int)
+
+        logger.info("Saving filenames to s3 at {}".format(file_key))
+        save_dataframe_to_s3_parquet(
+            df=filenames,
+            file_key=file_key,
+            bucket_name=bucket_name
+        )
--- a/etl/spatial/README.md
+++ b/etl/spatial/README.md
@ -0,0 +1,48 @@
+# Spatial - Geospatial Data Processing Service
+
+## Overview
+
+The Spatial service is designed to read, process, and analyze geospatial data related to
+conservation areas and special buildings. It uses datasets from Historic England and the
+UK government to determine whether a given UPRN (Unique Property Reference Number) is within
+a conservation area or is a listed building. The processed data is saved back to an S3 bucket
+in a parquet format for easy retrieval and further analysis.
+
+## Dependencies
+
+Dependencies are listed in requirements.txt. To install them, run:
+
+```
+pip install -r requirements.txt
+```
+
+## Data Sources
+
+1. **Historic England Conservation Areas**: Shapefile containing polygons of conservation areas.
+2. **UK Government Conservation Areas**: GeoJSON file containing polygons of conservation areas.
+3. **Open UPRN Data**: CSV file with UPRN and corresponding geospatial data.
+4. **Historic England Listed Buildings**: Shapefile with information on listed buildings.
+5. **Historic England Heritage Buildings at Risk**: Shapefile with information on heritage buildings at risk.
+
+## Files
+
+- app.py: Main application file that orchestrates the data processing flow.
+- ConservationAreaClient.py: Handles reading and processing of conservation area data.
+- OpenUprnClient.py: Manages reading and partitioning of Open UPRN data.
+- SpecialBuildingsClient.py: Takes care of reading and processing data related to special buildings.
+- requirements.txt: Lists all Python package dependencies.
+
+## How to Run
+
+1. Make sure you have all the required packages installed.
+2. Update the S3 bucket and file path constants in app.py.
+3. Run app.py.
+
+## Workflow
+
+1. Read the datasets for conservation areas and special buildings.
+2. Read the Open UPRN dataset and partition it into smaller chunks based on UPRN.
+3. For each partition:
+    - Convert UPRN data to geopandas DataFrame.
+    - Check if each UPRN is within a conservation area or is a special building.
+    - Save the processed data back to S3 in parquet format.
--- a/etl/spatial/SpecialBuildingsClient.py
+++ b/etl/spatial/SpecialBuildingsClient.py
@ -0,0 +1,114 @@
+import geopandas as gpd
+from shapely.geometry import Point
+from utils.logger import setup_logger
+from etl.spatial.ConservationAreaClient import read_shapefile_from_s3
+from datatypes.datatypes import OpenUprnCoordinateData
+
+logger = setup_logger()
+
+
+class SpecialBuildingsClient:
+    """
+    This class reads in data from Historic England, which can be used to determine if specific buildings are
+    listed or heritage buildings
+    """
+
+    def __init__(self, historic_england_listed_buildings_path, historic_england_heritage_buildings_path, bucket):
+        self.historic_england_listed_buildings_path = historic_england_listed_buildings_path
+        self.historic_england_heritage_buildings_path = historic_england_heritage_buildings_path
+        self.bucket = bucket
+
+        self.historic_england_listed_buildings = None
+        self.historic_england_heritage_buildings = None
+
+    def read(self):
+        """
+        Read the data
+        """
+        logger.info("Reading in historic england listed buildings shapefile")
+        self.historic_england_listed_buildings = read_shapefile_from_s3(
+            bucket_name=self.bucket, s3_file_key=self.historic_england_listed_buildings_path
+        )
+
+        logger.info("Reading in historic england heritage buildings shapefile")
+        self.historic_england_heritage_buildings = read_shapefile_from_s3(
+            bucket_name=self.bucket, s3_file_key=self.historic_england_heritage_buildings_path
+        )
+
+        # Convert the heritage buildings data to British National Grid co-ordinates (EPSG:27700)
+        self.historic_england_heritage_buildings = self.historic_england_heritage_buildings.to_crs("EPSG:27700")
+
+    def is_listed_building(self, coordinates: OpenUprnCoordinateData) -> bool:
+        """
+        Check if a location specified by British National Grid coordinates is a listed building.
+
+        :param coordinates: OpenUprnCoordinateData whose X_COORDINATE and Y_COORDINATE attributes give the location
+        :return: True if the location is within a listed building polygon, False otherwise
+        """
+        # Convert the coordinates to a Shapely Point object
+        point = Point(coordinates.X_COORDINATE, coordinates.Y_COORDINATE)
+
+        # Check if the point is within any of the listed building polygons
+        within_listed_buildings = self.historic_england_listed_buildings.contains(point)
+
+        if within_listed_buildings.any():
+            # If the point is within any listed building polygon, log the names of the buildings and return
+            # True
+            names = self.historic_england_listed_buildings.loc[within_listed_buildings, "Name"]
+            logger.info(f"The location is within the following listed buildings: {names.values}")
+            return True
+
+        # If the point is not within any listed building polygon, return False
+        return False
+
+    def is_listed_building_vectorised(self, uprn_gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
+        # Check against historic England listed buildings data
+        joined_gdf_listed = gpd.sjoin(uprn_gdf, self.historic_england_listed_buildings, how="left", predicate="within")
+
+        # Identify where we have matches
+        uprn_is_listed = joined_gdf_listed[~joined_gdf_listed.index_right.isna()]["UPRN"].unique()
+
+        # Populate the results in the input GeoDataFrame
+        uprn_gdf['is_listed_building'] = False
+        uprn_gdf.loc[uprn_gdf["UPRN"].isin(uprn_is_listed), 'is_listed_building'] = True
+
+        return uprn_gdf
+
+    def is_heritage_building_at_risk(self, coordinates: OpenUprnCoordinateData) -> bool:
+        """
+        Check if a location specified by British National Grid coordinates is a heritage building at risk.
+
+        :param coordinates: OpenUprnCoordinateData whose X_COORDINATE and Y_COORDINATE attributes give the location
+        :return: True if the location is within a heritage building at risk polygon,
+                 False otherwise
+        """
+        # Convert the coordinates to a Shapely Point object
+        point = Point(coordinates.X_COORDINATE, coordinates.Y_COORDINATE)
+
+        # Check if the point is within any of the heritage building at risk polygons
+        within_heritage_buildings_at_risk = self.historic_england_heritage_buildings.contains(point)
+
+        if within_heritage_buildings_at_risk.any():
+            # If the point is within any heritage building at risk polygon, log the names of the buildings and return
+            # True
+            names = self.historic_england_heritage_buildings.loc[within_heritage_buildings_at_risk, "EntryName"]
+            logger.info(f"The location is within the following heritage buildings at risk: {names.values}")
+            return True
+
+        # If the point is not within any heritage building at risk polygon, return False
+        return False
+
+    def is_heritage_building_at_risk_vectorised(self, uprn_gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
+        # Check against historic England heritage buildings data
+        joined_gdf_heritage = gpd.sjoin(
+            uprn_gdf, self.historic_england_heritage_buildings, how="left", predicate="within"
+        )
+
+        # Identify where we have matches
+        uprn_is_heritage = joined_gdf_heritage[~joined_gdf_heritage.index_right.isna()]["UPRN"].unique()
+
+        # Populate the results in the input GeoDataFrame
+        uprn_gdf['is_heritage_building'] = False
+        uprn_gdf.loc[uprn_gdf["UPRN"].isin(uprn_is_heritage), 'is_heritage_building'] = True
+
+        return uprn_gdf
--- a/etl/spatial/init.py
+++ b/etl/spatial/init.py
--- a/etl/spatial/app.py
+++ b/etl/spatial/app.py
@ -0,0 +1,103 @@
+"""
+This application reads in the open uprn data from a static location, flags each UPRN against the
+spatial datasets and saves the results to S3 as parquet for querying from other services
+"""
+
+import os
+from tqdm import tqdm
+import pandas as pd
+from etl.spatial.ConservationAreaClient import ConservationAreaClient
+from etl.spatial.OpenUprnClient import OpenUprnClient
+from etl.spatial.SpecialBuildingsClient import SpecialBuildingsClient
+from utils.logger import setup_logger
+from utils.s3 import save_dataframe_to_s3_parquet
+
+BUCKET = "retrofit-datalake-dev"
+OUTPUT_BUCKET = "retrofit-data-dev"
+HISTORIC_ENGLAND_CONSERVARION_AREAS_PATHNAME = "spatial/Historic_England_Conservation_Areas/Conservation_Areas.shp"
+GOV_CONSERVARION_AREAS_PATHNAME = "spatial/gov-conservation-area.geojson"
+OPEN_UPRN_PATHNAME = "spatial/osopenuprn_202309_csv/osopenuprn_202308.csv"
+HISTORIC_ENGLAND_LISTED_BUILDINGS_PATHNAME = "spatial/National_Heritage_List_for_England_(" \
+                                             "NHLE)/Listed_Building_polygons.shp"
+HISTORIC_ENGLAND_HERITAGE_BUILDINGS_PATHNAME = \
+    "spatial/Historic_England_Heritage_at_Risk_Register_2022/Historic_England_Heritage_at_Risk_Register_2022.shp"
+
+logger = setup_logger()
+
+
+def app():
+    """
+    This application uses the conservation area datasets to determine if a UPRN is
+    in a conservation area or not
+
+    We use two sources of data for determining if homes are in conservation areas.
+    The first is the Historic England dataset, which is a shapefile containing
+    polygons of conservation areas. The second is the gov.uk dataset, which is a
+    geojson file containing polygons of conservation areas.
+
+    The Historic England dataset can be found here:
+    https://historicengland.org.uk/listing/the-list/data-downloads/#aa33785e
+
+    The listed building dataset is also found at Historic England at:
+    https://historicengland.org.uk/listing/the-list/data-downloads/#aa33785e
+
+    The heritage buildings dataset is also found at Historic England at:
+    https://historicengland.org.uk/listing/the-list/data-downloads/#aa33785e
+
+    The Gov.uk dataset can be found here:
+    https://www.planning.data.gov.uk/dataset/conservation-area
+
+    The open UPRN data can be found here:
+    https://osdatahub.os.uk/downloads/open/OpenUPRN
+
+    The Office for National Statistics Postcode Lookup can be found here:
+    https://geoportal.statistics.gov.uk/datasets/9ac0331178b0435e839f62f41cc61c16/about
+
+    For the moment, these data sources are downloaded manually and uploaded to S3.
+    This application then processes those files and writes the results to s3
+    """
+
+    conservation_area_client = ConservationAreaClient(
+        historic_england_path=HISTORIC_ENGLAND_CONSERVARION_AREAS_PATHNAME,
+        gov_path=GOV_CONSERVARION_AREAS_PATHNAME,
+        bucket=BUCKET
+    )
+    conservation_area_client.read()
+
+    special_buildings_client = SpecialBuildingsClient(
+        historic_england_listed_buildings_path=HISTORIC_ENGLAND_LISTED_BUILDINGS_PATHNAME,
+        historic_england_heritage_buildings_path=HISTORIC_ENGLAND_HERITAGE_BUILDINGS_PATHNAME,
+        bucket=BUCKET
+    )
+    special_buildings_client.read()
+
+    open_uprn_client = OpenUprnClient(
+        path=OPEN_UPRN_PATHNAME,
+        bucket=BUCKET
+    )
+    open_uprn_client.read()
+
+    # We want to sort the data and split it into filenames on UPRN.
+    # We'll split the data into chunks of 50,000
+    open_uprn_client.create_file_partitions()
+
+    logger.info("Extracting spatial data for uprn partitions")
+    to_loop_over = open_uprn_client.data.groupby("filename")
+
+    for filename, uprn_df in tqdm(open_uprn_client.data.groupby("filename"), total=len(to_loop_over)):
+        uprn_gdf = OpenUprnClient.convert_bng_data_to_gpd(uprn_df)
+
+        uprn_gdf = conservation_area_client.is_in_conservation_area_vectorised(uprn_gdf=uprn_gdf)
+        uprn_gdf = special_buildings_client.is_listed_building_vectorised(uprn_gdf=uprn_gdf)
+        uprn_gdf = special_buildings_client.is_heritage_building_at_risk_vectorised(uprn_gdf=uprn_gdf)
+
+        # Convert back to a regular dataframe
+        uprn_gdf = uprn_gdf.drop(columns=["geometry"])
+        uprn_gdf = pd.DataFrame(uprn_gdf)
+
+        save_dataframe_to_s3_parquet(
+            df=uprn_gdf, file_key=os.path.join("spatial", filename), bucket_name=OUTPUT_BUCKET
+        )
+
+    # We finally save the filenames to s3
+    open_uprn_client.save_filenames_to_s3(bucket_name=OUTPUT_BUCKET)
--- a/conservation_areas/requirements.txt
+++ b/conservation_areas/requirements.txt
--- a/etl/spatial/tests/test_borehole_client.py
+++ b/etl/spatial/tests/test_borehole_client.py
@ -1,5 +1,5 @@
 import pytest
-from model_data.BoreholeClient import BoreholeClient
+from etl.spatial.BoreholeClient import BoreholeClient


@pytest.fixture
--- a/etl/wall_area/init.py
+++ b/etl/wall_area/init.py
--- a/model_data/simulation_system/area_data.py
+++ b/model_data/simulation_system/area_data.py
@ -1,5 +1,5 @@
 """
-This script produces the dataset used to model the wall area of properties, which is used to estimate the cost
+This script produces the dataset used to model the wall area of properties, which is used to estimate the cost
 of insulation measures within homes
 """
 import os
--- a/model_data/simulation_system/requirements/area_data.txt
+++ b/model_data/simulation_system/requirements/area_data.txt
--- a/infrastructure/terraform/main.tf
+++ b/infrastructure/terraform/main.tf
@ -83,7 +83,7 @@ resource "aws_db_instance" "default" {
  publicly_accessible = true
 }

-# Set up the bucket that recieve the csv uploads of properties to be retrofit
+# Set up the bucket that receives the csv uploads of properties to be retrofit
 module "s3_presignable_bucket" {
  source          = "./modules/s3_presignable_bucket"
  bucketname      = "retrofit-plan-inputs-${var.stage}"
--- a/input_property_list.csv
+++ b/input_property_list.csv
@ -0,0 +1,12 @@
+address,postcode,Notes,,,,
+28 Distillery Wharf,W6 9bf,,,,,
+Flat 14 Godley V C House,E2 0LP,,,,,
+49 Elderfield Road,E5 0LF,,,,,
+26 Stanhope Road,N6 5NG,,,,,
+Flat 3 Frederick Building,N1 4BD,,,,,
+Flat 4 Frederick Building,N1 4BD,,,,,
+"Flat 28, 22 Adelina Grove",E1 3BX,,,,,
+"Flat 39, 239 Long Lane",SE1 4PT,,,,,
+"1, Westview, Somerby",LE14 2QH,This property has an unfilled cavity,,,,
+"59, Ashdale",CM23 4EB,This property has a partially filled cavity,,,,
+88 Cleveland Avenue,DL3 7BE,This property has a filled cavity,,,,
--- a/model_data/README.md
+++ b/model_data/README.md
@ -1,49 +0,0 @@
-# Environment setup
-
-We're using conda to manage environments to circumvent the
-issues with Mac M1. This documentation will also cover Pycharm setup.
-
-We're working in python 3.10 so
-
-```commandline
-conda create -n hestia-data python=3.10
-```
-
-Then activate the environment
-
-```commandline
-conda activate hestia-data
-```
-
-To set up with Pycharm, run
-
-```commandline
-which python
-```    
-
-and grab the path to the python executable. Then in Pycharm, go to
-Settings > Project > Python Interpreter and click the gear icon
-to add a new interpreter. Select Conda and either paste the path to the python executable
-and click OK, or select the conda environment from the dropdown.
-
-You may need to restart Pycharm for the new interpreter to be recognised.
-
-To install project dependencies navigate to /model_data and run
-
-```commandline
-pip install -r requirements.txt
-```
-
-### Running Tests
-
-If you are not in a virtual environment, activate it with
-
-```commandline
-conda activate envName
-```
-
-Then run
-
-```commandline
-pytest --cov-config=model_data/.coveragerc --cov=model_data
-```
--- a/model_data/analysis/SapModel.py
+++ b/model_data/analysis/SapModel.py
@ -1,650 +0,0 @@
-import numpy as np
-import pandas as pd
-import statsmodels.api as sm
-import matplotlib.pyplot as plt
-from typing import Dict, Optional, List
-from sklearn.model_selection import train_test_split
-from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, explained_variance_score, \
-    median_absolute_error, mean_absolute_percentage_error
-from sklearn.ensemble import RandomForestRegressor
-from sklearn.inspection import permutation_importance
-from model_data.EpcClean import EpcClean
-
-from statsmodels.stats.outliers_influence import variance_inflation_factor
-from tqdm import tqdm
-from utils.logger import setup_logger
-
-logger = setup_logger()
-
-
-class SapModel:
-    # We want to estimate for making improvements on different property components
-    RESPONSE = "current-energy-efficiency"
-    # We could potentially  build models by constituency to avoid having too many
-    # features in the model
-    BASE_FEATURES = [
-        "property-type",
-        "built-form",
-        "construction-age-band",
-        "number-habitable-rooms",
-        "constituency",
-        "number-heated-rooms",
-        "transaction-type"
-    ]
-
-    COMPONENT_FEATURES = [
-        "walls-description",
-        "floor-description",
-        "lighting-description",
-        "roof-description",
-        "mainheat-description",
-        "hotwater-description",
-        "main-fuel",
-        "mechanical-ventilation",
-        "secondheat-description",
-        "energy-tariff",
-        "solar-water-heating-flag",
-        "photo-supply",
-        "windows-description",
-        "glazed-type",
-        "glazed-area",
-        "multi-glaze-proportion",
-        # "lighting-description"  # Might not need to use this
-        "low-energy-lighting",
-        "number-open-fireplaces",
-        "mainheatcont-description",
-        "fixed-lighting-outlets-count",
-        "floor-height",
-        "floor-level",
-        "total-floor-area",
-        "extension-count",
-    ]
-
-    CATEGORICAL_COLS = [
-        "property-type",
-        "built-form",
-        "number-habitable-rooms",
-        "constituency",
-        "number-heated-rooms",
-        "mainheat-description",
-        "hotwater-description",
-        "main-fuel",
-        "mechanical-ventilation",
-        "secondheat-description",
-        "energy-tariff",
-        "solar-water-heating-flag",
-        "windows-description",
-        "glazed-type",
-        "glazed-area",
-        "construction-age-band",
-        "lighting-description",
-        "mainheatcont-description",
-        "floor-level",
-    ]
-
-    NUMERICAL_COLUMNS = [
-        "photo-supply", "multi-glaze-proportion", "low-energy-lighting", "number-open-fireplaces",
-        "fixed-lighting-outlets-count",
-        "floor-height",
-        "total-floor-area",
-        "extension-count",
-    ]
-
-    # For the moment, we store records of the best performing models as a benchmark for future imporvements
-    BEST_FIT = {
-        'MAPE': 0.04646530042225876, 'Mean Squared Error': 18.635209563729763,
-        'Mean Absolute Error': 2.856347408023325, 'R2 Score': 0.800701753826118,
-        'Explained Variance Score': 0.800701753826118, 'Median Absolute Error': 1.9026758012120197
-    }
-
-    BEST_PREDICT = {
-        'MAPE': 0.04346083528432316, 'Mean Squared Error': 21.16036509335514,
-        'Mean Absolute Error': 3.0440540802375833, 'R2 Score': 0.7219965012634312,
-        'Explained Variance Score': 0.7220620137390414, 'Median Absolute Error': 1.9031967986967828
-    }
-
-    BEST_FINAL = {
-        'MAPE': 0.04841470773386795, 'Mean Squared Error': 21.323052316630914, 'Mean Absolute Error': 2.988547998636157,
-        'R2 Score': 0.7633662459299112, 'Explained Variance Score': 0.7633785339028832,
-        'Median Absolute Error': 1.9487883489495985
-    }
-
-    BUCKET_VARIABLES = [
-        "number-open-fireplaces", "fixed-lighting-outlets-count", 'extension-count', 'multi-glaze-proportion'
-    ]
-
-    def __init__(
-        self, data: List[Dict],
-        cleaner: EpcClean,
-        test_size: Optional[float] = 0.2,
-        random_state: Optional[int] = None
-    ):
-        self.df = pd.DataFrame(data)
-        self.cleaner = cleaner
-        self.random_state = random_state if random_state is not None else 42
-        self.test_size = 0.2 if test_size is None else test_size
-
-        self.model_data = None
-        self.train_x = None
-        self.train_y = None
-        self.test_x = None
-        self.test_y = None
-
-        self.test_model = None
-        self.final_model = None
-
-        self.fit_error = None
-        self.predict_error = None
-        self.final_error = None
-        self.worst = {
-            "fit_errors": pd.DataFrame(),
-            "prediction_errors": pd.DataFrame(),
-            "fit_x": pd.DataFrame(),
-            "prediction_x": pd.DataFrame(),
-            "final_errors": pd.DataFrame(),
-            "final_x": pd.DataFrame(),
-        }
-
-        self.fit_df = None
-        self.predict_df = None
-        self.final_fit_df = None
-        self.diagnosis = {}
-
-    def run(self, plot: bool = False) -> None:
-        """
-        A pipeline method to run all necessary methods in correct order.
-        :param plot: Boolean to indicate whether to plot the regression
-        """
-        try:
-            self.create_dataset()
-            self.fit_model()
-            if plot:
-                self.plot_regression(self.fit_df)
-        except Exception as e:
-            logger.error("An error occurred during execution.")
-            logger.error(str(e))
-
-    def _merge_with_u_values(
-        self, model_data: pd.DataFrame, description: str, thermal_transmittance: str
-    ) -> pd.DataFrame:
-
-        """
-        Utility function to merge u value data with model data
-        :param model_data: Pandas dataframe which is the main modelling dataset
-        :param description: Name of the description column for which we're merging u-values onto
-        :param thermal_transmittance: Name of the thermal transmittance column
-        :return:
-        """
-
-        u_values = pd.DataFrame(self.cleaner.cleaned[f"{description}-description"])[
-            ["original_description", thermal_transmittance]].rename(
-            columns={thermal_transmittance: f"{description}_u_value"}
-        )
-
-        model_data = model_data.merge(
-            u_values,
-            how="left",
-            left_on=f"{description}-description",
-            right_on="original_description"
-        ).drop(columns=["original_description"])
-
-        return model_data
-
-    def _append_cleaned_data(self, model_data: pd.DataFrame) -> pd.DataFrame:
-        """
-        Appends cleaned data into the model data.
-        :param model_data: Original model data.
-        :return: Model data with cleaned data appended.
-        """
-        for description in ["walls", "floor", "roof"]:
-            model_data = self._merge_with_u_values(model_data, description, "thermal_transmittance")
-
-        # lighting_proportions added separately as it doesn't use the _merge_with_u_values method
-        lighting_proportions = pd.DataFrame(self.cleaner.cleaned["lighting-description"])[
-            ["original_description", "low_energy_proportion"]]
-
-        model_data = model_data.merge(
-            lighting_proportions,
-            how="left",
-            left_on="lighting-description",
-            right_on="original_description"
-        ).drop(columns=["original_description"])
-
-        return model_data
-
-    @staticmethod
-    def _convert_transaction_type(model_data: pd.DataFrame) -> pd.DataFrame:
-        """
-        Converts transaction type to boolean
-        :param model_data: Model data with transaction type.
-        :return: Model data with converted transaction type.
-        """
-        model_data["is_rdsap"] = model_data["transaction-type"] != "new dwelling"
-        model_data = model_data.drop(columns=["transaction-type"])
-        return model_data
-
-    @staticmethod
-    def bucket_and_fill(df: pd.DataFrame, column_name: str, n_bins: int = 10) -> pd.DataFrame:
-        """
-        Simple utility function to bucket up features into bins and then fill any missing values with "NO_RECORD"
-        :param df: Dataframe of features to be binned
-        :param column_name: Name of the column to be binned
-        :param n_bins: Number of bins to use
-        :return: Dataframe with binned column
-        """
-        # Check if the column is numerical
-        if np.issubdtype(df[column_name].dtype, np.number):
-            # Create a new categorical column from numerical one by binning the data
-            df[column_name + "_bucket"] = pd.cut(df[column_name], bins=n_bins).astype(str)
-            # Replace missing data with "NO_RECORD"
-            df[column_name + "_bucket"] = df[column_name + "_bucket"].fillna("NO_RECORD")
-            df[column_name + "_bucket"] = np.where(
-                df[column_name + "_bucket"] == "nan",
-                "NO_RECORD",
-                df[column_name + "_bucket"]
-            )
-        return df
-
-    def _clean_numericals(self, model_data):
-
-        # Try binning numericals
-        remaining_numericals = [x for x in self.NUMERICAL_COLUMNS if x not in self.BUCKET_VARIABLES]
-
-        for col in self.BUCKET_VARIABLES:
-            model_data[col] = pd.to_numeric(model_data[col], errors='coerce')
-            # If all values are missing, set all values to 0 - this column will get dropped
-            if all(pd.isnull(model_data[col])):
-                model_data[col + "_bucket"] = "NO_RECORD"
-                continue
-            model_data = self.bucket_and_fill(model_data, col)
-
-        # Replace the data with the binned version
-        model_data = model_data.drop(columns=self.BUCKET_VARIABLES)
-        model_data = model_data.rename(
-            columns=dict(zip([c + "_bucket" for c in self.BUCKET_VARIABLES], self.BUCKET_VARIABLES))
-        )
-
-        # Basic fill the rest of the columns with 0 - currenrtly this provided the best performance
-        for col in remaining_numericals:
-            model_data[col] = np.where(
-                model_data[col] == "", "0", model_data[col]
-            ).astype(float)
-
-        return model_data
-
-    @staticmethod
-    def clean_missings(model_data: pd.DataFrame) -> pd.DataFrame:
-        """
-        Fills categorical missing data with sensible values
-        :param model_data: Original model data.
-        :return: Model data with cleaned categorical data.
-        """
-
-        # Cleaning of energy-tariff and construction-age-band hurt prediction performance, indicating there is
-        # potentially
-        # a notable difference between a "" missing and a "NO DATA!" missing, worth differentiating
-
-        model_data["mechanical-ventilation"] = np.where(
-            model_data["mechanical-ventilation"] == "", "NO DATA!", model_data["mechanical-ventilation"]
-        )
-
-        model_data["solar-water-heating-flag"] = np.where(
-            model_data["solar-water-heating-flag"] == "", "N", model_data["solar-water-heating-flag"]
-        )
-
-        model_data["glazed-type"] = np.where(
-            model_data["glazed-type"] == "", "NO DATA!", model_data["glazed-type"]
-        )
-
-        model_data["glazed-area"] = np.where(
-            model_data["glazed-area"] == "", "NO DATA!", model_data["glazed-type"]
-        )
-
-        return model_data
-
-    def create_dataset(self):
-        logger.info("Creating modelling dataset")
-        model_data = self.df[[self.RESPONSE] + self.COMPONENT_FEATURES + self.BASE_FEATURES]
-        model_data = model_data.reset_index(drop=True)
-        model_data["idx"] = model_data.index.copy()
-
-        # Append on u-values
-        model_data = self._append_cleaned_data(model_data)
-
-        model_data = self.clean_missings(model_data)
-
-        # Convert transaction_type
-        model_data = self._convert_transaction_type(model_data)
-
-        # Clean numerical columns
-        model_data = self._clean_numericals(model_data)
-
-        # Take just entries with U-values
-        # TODO: Rather than doing this, do we want to include the estimated u-values?
-        #       Since this ends up with just 2k entries
-        model_data = model_data[
-            ~pd.isnull(model_data["walls_u_value"]) &
-            ~pd.isnull(model_data["floor_u_value"]) &
-            ~pd.isnull(model_data["roof_u_value"])
-            ]
-
-        exclude_features = [
-            "walls-description", "floor-description", "roof-description", "transaction-type"
-        ]
-
-        features = [
-            x for x in self.BASE_FEATURES + self.COMPONENT_FEATURES + [
-                "walls_u_value", "floor_u_value", "roof_u_value", self.RESPONSE, "idx", "is_rdsap"
-            ] if x not in exclude_features
-        ]
-
-        model_data = model_data[features]
-
-        for col in self.CATEGORICAL_COLS:
-            model_data[col] = model_data[col].astype('category')
-
-        # Convert response
-        model_data[self.RESPONSE] = model_data[self.RESPONSE].astype(float)
-
-        self.model_data = model_data
-
-    def make_training_test(self, x):
-        # Split into training and test
-        self.train_x, self.test_x, self.train_y, self.test_y = train_test_split(
-            x.drop(self.RESPONSE, axis=1),
-            x[self.RESPONSE],
-            test_size=self.test_size,
-            random_state=self.random_state
-        )
-
-    @staticmethod
-    def remove_zero_std_cols(train_x, test_x=None, threshold=1e-3):
-        """
-        Utility function to remove columns that have zero standard deviation from both test and train sets
-        :param train_x: Training data to remove columns from
-        :param test_x: If provided, remove the same columns from the test data
-        :param threshold: float value, if the standard deviation is below this threshold, the column is considered
-                             to have zero standard deviation
-        :return: Tuple of train_x and test_x (if provided). If test_x is not provided, a null placeholder is returned
-        """
-        # Compute standard deviations
-        std_devs = train_x.std()
-
-        # Find columns with zero or near-zero standard deviation
-        zero_std_cols = std_devs[std_devs <= threshold].index
-
-        # Drop these columns from the training data
-        train_x = train_x.drop(zero_std_cols, axis=1)
-
-        if test_x is not None:
-            # Ensure the test data has the same columns
-            test_x = test_x[train_x.columns]
-            return train_x, test_x
-
-        return train_x, None
-
-    def fit_model(self):
-        """
-        Main function to fit the model and produce accuracy metrics
-        """
-
-        x = pd.get_dummies(self.model_data, columns=self.CATEGORICAL_COLS + self.BUCKET_VARIABLES, drop_first=True)
-
-        # Convert booleans to integer
-        for col in x.columns:
-            if x[col].dtype == bool:
-                x[col] = x[col].astype(int)
-
-            if x[col].dtype == object:
-                x[col] = x[col].astype(float)
-
-        # Create the training and test sets for each run
-        self.make_training_test(x)
-        self.train_x, self.test_x = self.remove_zero_std_cols(self.train_x, self.test_x)
-        logger.info("Detecting multi-collinearity in training dataset")
-        self.detect_multi_collinearity()
-
-        # Add a constant to the independent value
-        train_x = sm.add_constant(self.train_x)
-        test_x = sm.add_constant(self.test_x)
-        train_idx = train_x["idx"].copy()
-        test_idx = self.test_x["idx"].copy()
-        train_x = train_x.drop(columns=["idx"])
-        test_x = test_x.drop(columns=["idx"])
-
-        logger.info("Fitting testing model")
-        # make regression model
-        model = sm.OLS(self.train_y, train_x)
-        # fit model and print results
-        self.test_model = model.fit()
-
-        train_predictions = self.test_model.fittedvalues
-        test_predictions = self.test_model.predict(test_x)
-
-        self.fit_error, self.worst["fit_errors"] = self.calculate_regression_metrics(
-            y_true=self.train_y, y_pred=train_predictions
-        )
-
-        # Predict on new data
-        self.predict_error, self.worst["prediction_errors"] = self.calculate_regression_metrics(
-            y_true=self.test_y, y_pred=test_predictions
-        )
-
-        fit_success = self.check_successes(self.fit_error, self.BEST_FIT)
-        predict_success = self.check_successes(self.predict_error, self.BEST_PREDICT)
-
-        self.model_data['fit'] = self.test_model.fittedvalues
-        # The worst errors over index heavily for flats
-        self.worst["fit_x"] = self.model_data[self.model_data.index.isin(self.worst["fit_errors"].index)]
-        self.worst["prediction_x"] = self.model_data[self.model_data.index.isin(self.worst["prediction_errors"].index)]
-
-        self.fit_df = pd.DataFrame(
-            {
-                "fit": train_predictions,
-                "actual": self.train_y,
-                "idx": train_idx
-            }
-        ).sort_values("actual", ascending=True)
-
-        self.predict_df = pd.DataFrame(
-            {
-                "predictions": test_predictions,
-                "actual": self.test_y,
-                "idx": test_idx
-            }
-        )
-
-        self.diagnosis = {
-            "fit_success": fit_success,
-            "predict_success": predict_success,
-            "summary": self.test_model.summary()
-        }
-
-        # We're now ready to fit the final model
-        # For the momeent, the pre-processing at the top of this function merely removes columns, so we
-        # just need to remove the columns that were removed from the training data from the final model
-        logger.info("Fitting final model")
-        x = sm.add_constant(x)
-        y = x[self.RESPONSE]
-        x = x[self.train_x.columns]
-        idx = x["idx"].copy()
-        x = x.drop(columns=["idx"])
-
-        final_model = sm.OLS(y, x)
-        # fit model and print results
-        self.final_model = final_model.fit()
-        final_predictions = self.final_model.fittedvalues
-
-        self.final_error, self.worst["final_errors"] = self.calculate_regression_metrics(
-            y_true=y, y_pred=final_predictions
-        )
-
-        self.final_fit_df = pd.DataFrame(
-            {
-                "fit": final_predictions,
-                "actual": y,
-                "idx": idx
-            }
-        ).sort_values("actual", ascending=True)
-
-    @staticmethod
-    def check_successes(experiment_error, best_error):
-        """
-        Simple function to check if the experiment error is better than the best error
-        :param experiment_error:    output of calculate_regression_metrics() on the experiment
-        :param best_error:          Current benchmark best error
-        :return:
-        """
-
-        successes = []
-        for k in experiment_error:
-            if k in ["Explained Variance Score", "R2 Score"]:
-                # We want to maximise this so we want experiment error to be higher
-                successes.append(
-                    {
-                        "measure": k,
-                        "success": experiment_error[k] >= best_error[k],
-                        "difference": abs(experiment_error[k] - best_error[k])
-                    }
-                )
-                continue
-            successes.append(
-                {
-                    "measure": k,
-                    "success": experiment_error[k] <= best_error[k],
-                    "difference": abs(experiment_error[k] - best_error[k])
-                }
-            )
-
-        return pd.DataFrame(successes)
-
-    def rf_importance(self, train_x, train_y, test_x, test_y):
-        """
-        Utility function to estimate feature importance using a random forest
-        This is useful to get a sense of some of the key features which are driving model
-        performance
-
-        :param train_x: Training data covariates to build the importance model on
-        :param train_y: Training data response to build the importance model on
-        :param test_x:  Test data covariates to build the permutation importance model on
-        :param test_y:  Test data response to build the permutation importance model on
-        :return: Pandas dataframe of feature importances, ranked by most important to least
-        """
-
-        rf = RandomForestRegressor(random_state=self.random_state)
-        rf.fit(train_x, train_y)
-
-        # Print the name and importance of each feature
-        rf_importance_df = []
-        for feature, importance in zip(train_x.columns, rf.feature_importances_):
-            rf_importance_df.append(
-                {
-                    "Feature": feature,
-                    "rf_importance": importance
-                }
-            )
-        rf_importance_df = pd.DataFrame(rf_importance_df)
-        rf_importance_df = rf_importance_df.sort_values(by="rf_importance", ascending=False)
-
-        perm_importance = self.permuation_importance(rf, test_x, test_y)
-
-        return rf_importance_df, perm_importance
-
-    @staticmethod
-    def permuation_importance(rf, test_x, test_y):
-        """
-        Simple utility function to produce permutation importance for a given model\
-        :param rf: Random forest model to calculate permutation importance for
-        :param test_x: Test covariates to be used for permutation importance
-        :param test_y: Test response to be used for permutation importance
-        :return:
-        """
-        perm_importance = permutation_importance(rf, test_x, test_y, scoring='neg_mean_squared_error')
-        perm_importance_df = pd.DataFrame(
-            {
-                "Feature": test_x.columns,
-                "perm_importance": perm_importance.importances_mean
-            }
-        ).sort_values(by="perm_importance", ascending=False)
-
-        return perm_importance_df
-
-    def detect_multi_collinearity(self):
-        # Get the VIFs for each variable
-        vifs = pd.DataFrame()
-        vifs["features"] = self.train_x.columns
-        vifs["vif"] = [variance_inflation_factor(self.train_x.values, i) for i in tqdm(range(self.train_x.shape[1]))]
-
-        # Get the features with the highest VIF
-        vifs = vifs.sort_values("vif", ascending=False)
-
-        # There are some features, we do not want to remove
-        required_features = [
-            "walls_u_value", "floor_u_value", "roof_u_value", "idx", "is_rdsap"
-        ]
-
-        vifs = vifs[~vifs["features"].isin(required_features)]
-        drop_vifs = vifs[np.isinf(vifs["vif"])]
-
-        # Acceptable drop variables:
-        # main-fuel_Gas: mains gas
-        # glazed-type_NO DATA!
-        # glazed-area_NO DATA!
-
-        self.train_x = self.train_x.drop(columns=drop_vifs["features"].values)
-        self.test_x = self.test_x[self.train_x.columns]
-
-    @staticmethod
-    def plot_regression(df):
-        # Extract the "fit" and "actual" columns from the dataframe
-        fit = df['fit']
-        actual = df['actual']
-
-        # Create an array of x-values (assumed to be sequential integers)
-        x = np.arange(len(df))
-
-        # Plot the fit and actual data
-        plt.plot(x, fit, color='red', label='Fit')
-        plt.plot(x, actual, color='blue', label='Actual')
-
-        # Set labels and title
-        plt.xlabel('Index')
-        plt.ylabel('Value')
-        plt.title('Linear Regression - Fit vs Actual')
-
-        # Display legend
-        plt.legend()
-
-        # Show the plot
-        plt.show()
-
-    @staticmethod
-    def calculate_regression_metrics(y_true, y_pred, n=20):
-        """
-        Calculate the 5 most important accuracy metrics for regression.
-
-        Args:
-            y_true (array-like): Array of true target values.
-            y_pred (array-like): Array of predicted target values.
-
-        Returns:
-            dict: Dictionary containing the calculated metrics.
-        """
-        metrics = {
-            'MAPE': mean_absolute_percentage_error(y_true, y_pred),
-            'Mean Squared Error': mean_squared_error(y_true, y_pred),
-            'Mean Absolute Error': mean_absolute_error(y_true, y_pred),
-            'R2 Score': r2_score(y_true, y_pred),
-            'Explained Variance Score': explained_variance_score(y_true, y_pred),
-            'Median Absolute Error': median_absolute_error(y_true, y_pred)
-        }
-
-        errors = pd.DataFrame()
-        errors['Fit'] = y_true
-        errors['Actual'] = y_pred
-        errors['Residual'] = errors['Actual'] - errors['Fit']
-        errors['Absolute Residual'] = np.abs(errors['Residual'])
-
-        worst_errors = errors.nlargest(n, 'Absolute Residual')
-
-        return metrics, worst_errors
--- a/model_data/analysis/UvalueEstimations.py
+++ b/model_data/analysis/UvalueEstimations.py
@ -1,207 +0,0 @@
-import pickle
-import pandas as pd
-import numpy as np
-from model_data.EpcClean import EpcClean
-
-
-class UvalueEstimations:
-    def __init__(self, data: list):
-        """
-        Initialize the UvalueEstimations class.
-
-        :param data: The input data as a list of dictionaries, to be converted to a dataframe
-        """
-        self.data = pd.DataFrame(data)
-        self.walls = None
-        self.walls_decile_data = {}
-        self.roofs = None
-        self.floors = None
-        self.floors_decile_data = {}
-
-    def get_estimates(self, cleaner: EpcClean):
-        """
-        Calculate U-value estimates for walls, roofs, and floors.
-
-        :param cleaner: An instance of the EpcClean class used for cleaning data.
-        """
-        self.set_walls(cleaner)
-        self.set_roofs(cleaner)
-        self.set_floors(cleaner)
-
-    def set_walls(self, cleaner: EpcClean):
-        """
-        Set U-value estimates for walls.
-
-        :param cleaner: An instance of the EpcClean class used for cleaning data.
-        """
-        walls_columns = [
-            "local-authority", "property-type", "walls-description", "walls-energy-eff", "walls-env-eff", "built-form",
-            "total-floor-area", "number-habitable-rooms", "number-heated-rooms"
-        ]
-
-        walls_df = self.data[self.data["walls-description"].str.contains("Average thermal transmittance")]
-
-        # Take just the columns we want
-        walls_df = walls_df[walls_columns]
-        walls_df["total-floor-area"] = walls_df["total-floor-area"].astype(float)
-
-        walls_df, decile_labels, decile_boundaries = self.classify_into_deciles(walls_df, "total-floor-area")
-
-        # We now get the U-values
-        walls_df = walls_df.merge(
-            pd.DataFrame(cleaner.cleaned['walls-description'])[["original_description", "thermal_transmittance"]],
-            how="left",
-            right_on="original_description",
-            left_on="walls-description"
-        )
-
-        u_value_summary = walls_df.groupby(
-            [
-                "local-authority",
-                "property-type",
-                "walls-energy-eff",
-                "walls-env-eff",
-                "built-form",
-                "number-habitable-rooms",
-                "number-heated-rooms",
-                "total-floor-area_group"
-            ],
-            observed=True
-        ).agg({"thermal_transmittance": ["median", "size"]}).reset_index()
-
-        u_value_summary.columns = [
-            "local-authority",
-            "property-type",
-            "walls-energy-eff",
-            "walls-env-eff",
-            "built-form",
-            "number-habitable-rooms",
-            "number-heated-rooms",
-            "total-floor-area_group",
-            "median_thermal_transmittance",
-            "n_samples"
-        ]
-
-        self.walls = u_value_summary
-        self.walls_decile_data = {
-            "decile_labels": decile_labels,
-            "decile_boundaries": decile_boundaries
-        }
-
-    def set_roofs(self, cleaner: EpcClean):
-        """
-        Set U-value estimates for roofs.
-
-        :param cleaner: An instance of the EpcClean class used for cleaning data.
-        """
-        pass
-
-    def set_floors(self, cleaner: EpcClean):
-        """
-        Set U-value estimates for floors.
-
-        :param cleaner: An instance of the EpcClean class used for cleaning data.
-        """
-        floors_columns = [
-            "local-authority", "property-type", "floor-description", "floor-energy-eff", "floor-env-eff",
-            "built-form",
-            "total-floor-area", "number-habitable-rooms", "number-heated-rooms"
-        ]
-
-        floors_df = self.data[self.data["floor-description"].str.contains("Average thermal transmittance")]
-
-        # Take just the columns we want
-        floors_df = floors_df[floors_columns]
-        floors_df["total-floor-area"] = floors_df["total-floor-area"].astype(float)
-
-        floors_df, decile_labels, decile_boundaries = self.classify_into_deciles(floors_df, "total-floor-area")
-
-        # We now get the U-values
-        floors_df = floors_df.merge(
-            pd.DataFrame(cleaner.cleaned['floor-description'])[["original_description", "thermal_transmittance"]],
-            how="left",
-            right_on="original_description",
-            left_on="floor-description"
-        )
-
-        u_value_summary = floors_df.groupby(
-            [
-                "local-authority",
-                "property-type",
-                "floor-energy-eff",
-                "floor-env-eff",
-                "built-form",
-                "number-habitable-rooms",
-                "number-heated-rooms",
-                "total-floor-area_group"
-            ],
-            observed=True
-        ).agg({"thermal_transmittance": ["median", "size"]}).reset_index()
-
-        u_value_summary.columns = [
-            "local-authority",
-            "property-type",
-            "floor-energy-eff",
-            "floor-env-eff",
-            "built-form",
-            "number-habitable-rooms",
-            "number-heated-rooms",
-            "total-floor-area_group",
-            "median_thermal_transmittance",
-            "n_samples"
-        ]
-
-        self.floors = u_value_summary
-        self.floors_decile_data = {
-            "decile_labels": decile_labels,
-            "decile_boundaries": decile_boundaries
-        }
-
-    @staticmethod
-    def classify_into_deciles(df: pd.DataFrame, column: str) -> (pd.DataFrame, list, list):
-        """
-        Break a column in a Pandas DataFrame into deciles and classify new values into the existing deciles.
-
-        :param df: The input Pandas DataFrame.
-        :param column: The column name to break into deciles.
-
-        :return: A tuple containing:
-            - The DataFrame with the decile group column.
-            - The list of decile labels.
-            - The list of decile boundaries.
-        """
-        # Calculate decile boundaries
-        decile_boundaries = np.percentile(df[column], np.arange(0, 101, 10))
-
-        # Create decile labels
-        decile_labels = [f"Decile {i + 1}" for i in range(10)]
-
-        # Assign decile labels to existing values
-        df[column + "_group"] = pd.cut(df[column], bins=decile_boundaries, labels=decile_labels,
-                                       include_lowest=True)
-
-        return df, decile_labels, decile_boundaries
-
-    @staticmethod
-    def classify_decile_newvalues(decile_boundaries, decile_labels, new_values: list) -> list:
-        """
-        Classify new values into existing deciles based on decile definitions.
-
-        :param decile_boundaries: The list of decile boundaries.
-        :param decile_labels: The list of decile labels.
-        :param new_values: A list of new values to classify.
-
-        :return: The classifications for the new values as a list.
-        """
-        # Classify new values based on decile definitions
-        classifications = pd.cut(new_values, bins=decile_boundaries, labels=decile_labels, include_lowest=True)
-        return classifications.tolist()
-
-    def _save(self, filename):
-        """
-        Useful utility function to store this object, which is particularly handy for unit testing
-        :return:
-        """
-        with open(filename, 'wb') as f:
-            pickle.dump(self, f)
-           
--- a/model_data/config.py
+++ b/model_data/config.py
@ -1,6 +0,0 @@
-import os
-from dotenv import load_dotenv
-
-load_dotenv(dotenv_path='model_data/.env')
-
-EPC_AUTH_TOKEN = os.environ.get('EPC_AUTH_TOKEN')
--- a/model_data/downloader.py
+++ b/model_data/downloader.py
@ -1,29 +0,0 @@
-import time
-
-
-def pagenated_epc_download(client, params, page_size, n_pages, verbose=0, slowdown=0.1):
-    offset_from = 0
-    n_completed = 0
-    results = []
-    complete = False
-    while not complete:
-        if verbose:
-            print("Pulling for page %s" % str(int(offset_from / page_size) + 1))
-        time.sleep(slowdown)
-        search_resp = client.domestic.search(params=params, offset_from=offset_from, size=page_size)
-
-        # Note: We can only make 10k queries for a single set of search queries.
-        # It might make sense to download data via zip for machine learning since we don't need this
-        # data to be perfectly up to date
-        if not search_resp:
-            break
-
-        n_completed += 1
-
-        results.extend(search_resp["rows"])
-        if n_completed == n_pages:
-            complete = True
-        else:
-            offset_from += page_size
-
-    return results
--- a/model_data/plotting/plotting_functions.py
+++ b/model_data/plotting/plotting_functions.py
@ -1,40 +0,0 @@
-import pandas as pd
-import seaborn as sns
-import matplotlib.pyplot as plt
-
-
-def create_heatmap_plots(data, response_var, pivot_var1, pivot_var2, order1=None, order2=None):
-    """
-    Create a heatmap plot based on a list of data and given variables.
-
-    :param data: List of dictionaries, input data.
-    :param response_var: String, response variable to be plotted.
-    :param pivot_var1: String, first pivot variable to be used in the plot.
-    :param pivot_var2: String, second pivot variable to be used in the plot.
-    :param order1: List, the order of categories for pivot_var1. Optional.
-    :param order2: List, the order of categories for pivot_var2. Optional.
-
-    Returns:
-        None. Displays the generated plot.
-    """
-
-    # Create a DataFrame from your list of dictionaries
-    df = pd.DataFrame(data)
-
-    # Convert the response variable column to float type if it's not already
-    df[response_var] = df[response_var].astype(float)
-
-    # Create a pivot table
-    pivot = df.pivot_table(index=pivot_var1, columns=pivot_var2, values=response_var)
-
-    # If an order is provided, reorder the pivot table
-    if order1 is not None:
-        pivot = pivot.reindex(order1)
-    if order2 is not None:
-        pivot = pivot[order2]
-
-    # Plot the heatmap
-    plt.figure(figsize=(10, 6))
-    sns.heatmap(pivot, annot=True, fmt=".2f", cmap='coolwarm')
-    plt.title(f"Heatmap of {response_var} by {pivot_var1} and {pivot_var2}")
-    plt.show()
--- a/Show more
+++ b/Show more