Merge pull request #231 from Hestia-Homes/refactor-data-processor

Refactor data processor
2026-07-27 23:35:01 +00:00 · 2023-09-15 15:40:12 +01:00 · 2023-09-15 15:40:12 +01:00 · 402d71eb77
commit 402d71eb77
parent 98b6261fe1 8894ef7d6c
20 changed files with 592 additions and 5773 deletions
--- a/.coveragerc
+++ b/.coveragerc
@ -9,4 +9,5 @@ omit =
    model_data/plotting/*
    recommendations/rdsap_tables.py
    model_data/simulation_system/*
-    model_data/cleaner_app.py
+    model_data/cleaner_app.py
+    backend/app/*
--- a/.idea/Model.iml
+++ b/.idea/Model.iml
@ -7,7 +7,7 @@
      <sourceFolder url="file://$MODULE_DIR$/open_uprn" isTestSource="false" />
      <sourceFolder url="file://$MODULE_DIR$/recommendations" isTestSource="false" />
    </content>
-    <orderEntry type="jdk" jdkName="Python 3.10 (backend)" jdkType="Python SDK" />
+    <orderEntry type="jdk" jdkName="Python 3.10 (model_data)" jdkType="Python SDK" />
    <orderEntry type="sourceFolder" forTests="false" />
  </component>
 </module>
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
@ -1,6 +1,6 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <project version="4">
-  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (backend)" project-jdk-type="Python SDK" />
+  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (model_data)" project-jdk-type="Python SDK" />
  <component name="PythonCompatibilityInspectionAdvertiser">
    <option name="version" value="3" />
  </component>
--- a/backend/app/db/connection.py
+++ b/backend/app/db/connection.py
@ -11,4 +11,4 @@ db_string = connection_string.format(
    dbname=get_settings().DB_NAME,
 )

-db_engine = create_engine(db_string)
+db_engine = create_engine(db_string, pool_size=20, max_overflow=5)
--- a/backend/app/plan/columntypes.py
+++ b/backend/app/plan/columntypes.py
@ -1,32 +0,0 @@
-columntypes = {
-    'UPRN': 'object', 'TOTAL_FLOOR_AREA': 'float64', 'FLOOR_HEIGHT': 'float64', 'PROPERTY_TYPE': 'object',
-    'BUILT_FORM': 'object', 'CONSTITUENCY': 'object', 'NUMBER_HABITABLE_ROOMS': 'float64',
-    'NUMBER_HEATED_ROOMS': 'float64', 'FIXED_LIGHTING_OUTLETS_COUNT': 'float64', 'FLOOR_LEVEL': 'float64',
-    'CONSTRUCTION_AGE_BAND': 'object', 'TRANSACTION_TYPE_STARTING': 'object',
-    'WALLS_DESCRIPTION_STARTING': 'object',
-    'FLOOR_DESCRIPTION_STARTING': 'object', 'LIGHTING_DESCRIPTION_STARTING': 'object',
-    'ROOF_DESCRIPTION_STARTING': 'object', 'MAINHEAT_DESCRIPTION_STARTING': 'object',
-    'HOTWATER_DESCRIPTION_STARTING': 'object', 'MAIN_FUEL_STARTING': 'object',
-    'MECHANICAL_VENTILATION_STARTING': 'object',
-    'SECONDHEAT_DESCRIPTION_STARTING': 'object', 'ENERGY_TARIFF_STARTING': 'object',
-    'SOLAR_WATER_HEATING_FLAG_STARTING': 'object', 'PHOTO_SUPPLY_STARTING': 'float64',
-    'WINDOWS_DESCRIPTION_STARTING': 'object', 'GLAZED_TYPE_STARTING': 'object',
-    'MULTI_GLAZE_PROPORTION_STARTING': 'float64', 'LOW_ENERGY_LIGHTING_STARTING': 'float64',
-    'NUMBER_OPEN_FIREPLACES_STARTING': 'float64', 'MAINHEATCONT_DESCRIPTION_STARTING': 'object',
-    'EXTENSION_COUNT_STARTING': 'float64', 'LODGEMENT_DATE_STARTING': 'object',
-    'TRANSACTION_TYPE_ENDING': 'object',
-    'WALLS_DESCRIPTION_ENDING': 'object', 'FLOOR_DESCRIPTION_ENDING': 'object',
-    'LIGHTING_DESCRIPTION_ENDING': 'object',
-    'ROOF_DESCRIPTION_ENDING': 'object', 'MAINHEAT_DESCRIPTION_ENDING': 'object',
-    'HOTWATER_DESCRIPTION_ENDING': 'object',
-    'MAIN_FUEL_ENDING': 'object', 'MECHANICAL_VENTILATION_ENDING': 'object',
-    'SECONDHEAT_DESCRIPTION_ENDING': 'object',
-    'ENERGY_TARIFF_ENDING': 'object', 'SOLAR_WATER_HEATING_FLAG_ENDING': 'object',
-    'PHOTO_SUPPLY_ENDING': 'float64',
-    'WINDOWS_DESCRIPTION_ENDING': 'object', 'GLAZED_TYPE_ENDING': 'object',
-    'MULTI_GLAZE_PROPORTION_ENDING': 'float64',
-    'LOW_ENERGY_LIGHTING_ENDING': 'float64', 'NUMBER_OPEN_FIREPLACES_ENDING': 'float64',
-    'MAINHEATCONT_DESCRIPTION_ENDING': 'object', 'EXTENSION_COUNT_ENDING': 'float64',
-    'LODGEMENT_DATE_ENDING': 'object',
-    'id': 'object'
-}
--- a/backend/app/plan/router.py
+++ b/backend/app/plan/router.py
@ -8,8 +8,10 @@ from backend.app.config import get_settings
 from backend.Property import Property
 from epc_api.client import EpcClient
 from utils.logger import setup_logger
+from utils.s3 import read_from_s3
 from recommendations.FloorRecommendations import FloorRecommendations
 from recommendations.WallRecommendations import WallRecommendations
+from recommendations.config import UPGRADES_MAP
 from utils.uvalue_estimates import classify_decile_newvalues
 from backend.app.db.utils import row2dict
 from starlette.responses import Response
@ -17,6 +19,7 @@ from sqlalchemy.orm import sessionmaker
 from sqlalchemy.exc import IntegrityError, OperationalError
 from datetime import datetime
 import pandas as pd
+import msgpack

 # model apis
 from backend.ml_models.sap_change_model.api import SAPChangeModelAPI
@ -31,21 +34,17 @@ from backend.app.db.functions.recommendations_functions import (
 )
 from backend.app.db.functions.portfolio_functions import aggregate_portfolio_recommendations
 from backend.app.db.connection import db_engine
-from backend.app.plan.columntypes import columntypes

 from model_data.optimiser.GainOptimiser import GainOptimiser
 from model_data.optimiser.CostOptimiser import CostOptimiser
-from backend.app.utils import epc_to_sap_lower_bound, save_dataframe_to_s3_parquet, read_parquet_from_s3
+from backend.app.utils import epc_to_sap_lower_bound, read_parquet_from_s3
 from model_data.optimiser.optimiser_functions import prepare_input_measures
 from model_data.simulation_system.core.DataProcessor import DataProcessor
-from model_data.simulation_system.core.Settings import (
-    FIXED_FEATURES, COMPONENT_FEATURES, COLUMNS_TO_MERGE_ON
-)
+from model_data.simulation_system.core.Settings import COLUMNS_TO_MERGE_ON

 # TODO: This is placeholder until data is stored in DB
 from backend.app.plan.uvalue_estimates_walls import uvalue_estimates_walls
 from backend.app.plan.uvalue_estimates_floors import uvalue_estimates_floors
-from backend.app.plan.temp_cleaned_data import cleaned

 logger = setup_logger()

@ -98,12 +97,6 @@ walls_decile_data = {
                      'Decile 9', 'Decile 10'], 'decile_boundaries': [6., 49., 51., 55., 64., 71., 76., 83., 96.,
                                                                      120., 2279.]}

-lighting_averages = [
-    {'lighting-description': 'good lighting efficiency', 'low-energy-lighting': 99.26666666666667},
-    {'lighting-description': 'excellent lighting efficiency', 'low-energy-lighting': 100.0},
-    {'lighting-description': 'below average lighting efficiency', 'low-energy-lighting': 0.0}
-]
-

 def filter_materials(materials):
    materials_by_type = defaultdict(list)
@ -138,18 +131,62 @@ def insert_temp_recommendation_id(property_recommendations):
    return property_recommendations


-def score_measures():
+def get_cleaned():
+    """
+    This function will retrieve the cleaned dataset from s3 which has the cleaned
+    descriptions for the epc dataset
+
+    This data is stored in MessagePack format and therefore needs to be decoded
+    :return:
+    """
+
+    cleaned = read_from_s3(
+        s3_file_name="cleaned_epc_data/cleaned.bson",
+        bucket_name="retrofit-data-{environment}".format(environment=get_settings().ENVIRONMENT)
+    )
+
+    cleaned = msgpack.unpackb(cleaned, raw=False)
+
+    return cleaned
+
+
+def create_recommendation_scoring_data(
+    property: Property,
+    recommendation: dict,
+    starting_epc_data: pd.DataFrame,
+    ending_epc_data: pd.DataFrame,
+    fixed_data: pd.DataFrame,
+):
    """
    This wrapper function prepares data to be passed to the sap model api
    :return:
    """

+    scoring_dict = {
+        "UPRN": property.data["uprn"],
+        "id": "+".join([str(property.id), str(recommendation["recommendation_id"])]),
+        "LOCAL_AUTHORITY": property.data["local-authority"],
+        **starting_epc_data.to_dict("records")[0],
+        **ending_epc_data.to_dict("records")[0],
+        **fixed_data.to_dict("records")[0]
+    }
+
+    # We update the description to indicate it's insulated
+    if recommendation["type"] == "wall_insulation":
+        scoring_dict["WALLS_DESCRIPTION_ENDING"] = UPGRADES_MAP[property.walls["clean_description"]]
+    elif recommendation["type"] == "floor_insulation":
+        scoring_dict["FLOOR_DESCRIPTION_ENDING"] = UPGRADES_MAP[property.floor["clean_description"]]
+    else:
+        raise NotImplementedError("Implement me")
+
+    return scoring_dict
+

@router.post("/trigger")
 async def trigger_plan(body: PlanTriggerRequest):
    logger.info("Connecting to db")
-    Session = sessionmaker(bind=db_engine)
-    session = Session()
+    session = sessionmaker(bind=db_engine)()
+    created_at = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")

    try:
        session.begin()
@ -159,21 +196,17 @@ async def trigger_plan(body: PlanTriggerRequest):
        epc_client = EpcClient(auth_token=get_settings().EPC_AUTH_TOKEN)

        plan_input = read_csv_from_s3(bucket_name=bucket_name, filepath=body.trigger_file_path)
-
        input_properties = []
        for config in plan_input:
            # We validate each record in the file. If the record is NOT valid, we need to handle this accordingly
            # TODO: implment validation
-
            # Create a record in db
            property_id, is_new = create_property(
                session, portfolio_id=body.portfolio_id, address=config['address'], postcode=config['postcode']
            )
-
            # if a new record was not created, we don't produduce recommendations
            if not is_new:
                continue
-
            # TODO: Need to add heat demand target
            create_property_targets(
                session,
@ -195,37 +228,34 @@ async def trigger_plan(body: PlanTriggerRequest):
        if not input_properties:
            return Response(status_code=204)

-        logger.info("Getting EPC data")
+        logger.info("Getting EPC, coordinates and conservation area data")
        for p in input_properties:
            p.search_address_epc()
            p.set_year_built()

-        logger.info("Getting coordinates")
-        # This is placeholder, until the full dataset is loaded into the database
-        for p in input_properties:
            coordinate_data = [x for x in open_uprn_data if x['UPRN'] == int(p.data['uprn'])][0]
            p.set_coordinates(coordinate_data)

-        logger.info("Check if property is in conservation area")
-        for p in input_properties:
            in_conservation_area = [x for x in in_conservation_area_data if x['uprn'] == int(p.data['uprn'])][0].get(
                "is_in_conservation_area"
            )
            p.set_is_in_conservation_area(in_conservation_area)

        # The materials data could be cached or local so we don't need to make
-        # consistent requrests to the backend for
+        # consistent requests to the backend for
        # the same data
        # TODO: It might not be the best choice to store the materials data in a database table since thi
        #       table probably won't be very large and won't be updated that often. It might be better to
        #       store this data in s3 load it into memory when the app starts up. We will test this

+        logger.info("Reading in materials and cleaned datasets")
        materials = get_materials(session)
        materials_by_type = filter_materials(materials)
+        cleaned = get_cleaned()

        logger.info("Getting components and properties recommendations")

-        # TODO: Move this to a class. We probably was a Recommender class which takes the injects the optimisers
+        # TODO: Move this to a class. We probably want a Recommender class which takes the injects the optimisers
        #      in as a dependency and then the optimisers can take the input measures in as part of the setup() method
        recommendations = {}
        recommendations_scoring_data = []
@ -310,42 +340,33 @@ async def trigger_plan(body: PlanTriggerRequest):
            # TODO: We should use the cleaned data from get_components in the data rather than the raw
            #       values. We should create a method in Property which takes the EPC data and inserts the cleaned
            #       data
-            epc_data = p.data.copy()
-            epc_data = pd.DataFrame([epc_data])
-            epc_data.columns = [col.upper().replace("-", "_") for col in epc_data.columns]
-            starting_epc_data = epc_data[COMPONENT_FEATURES + ["LODGEMENT_DATE"]].copy().add_suffix("_STARTING")
-            ending_epc_data = epc_data[COMPONENT_FEATURES + ["LODGEMENT_DATE"]].copy().add_suffix("_ENDING")
-            fixed_data = epc_data[FIXED_FEATURES]
+
+            data_processor = DataProcessor(None, newdata=True)
+            data_processor.insert_data(pd.DataFrame([p.data.copy()]))
+            data_processor.pre_process()
+
+            starting_epc_data = data_processor.get_component_features(suffix="_STARTING")
+            ending_epc_data = data_processor.get_component_features(suffix="_ENDING")
+            fixed_data = data_processor.get_fixed_features()

            # We update the ending record with the recommended updates and we set lodgement date to today
-            ending_epc_data["LODGEMENT_DATE_ENDING"] = datetime.now().strftime("%Y-%m-%d")
+            ending_epc_data["LODGEMENT_DATE_ENDING"] = created_at

-            scoring_map = {
-                'Solid brick, as built, no insulation (assumed)': 'Solid brick, as built, insulated (assumed)',
-                'Suspended, no insulation (assumed)': 'Suspended, insulated (assumed)',
-                'Solid, no insulation (assumed)': 'Solid, insulated (assumed)',
-            }
            for recommendations_by_type in property_recommendations:
                for rec in recommendations_by_type:
-                    scoring_dict = {
-                        "UPRN": p.data["uprn"],
-                        "id": "+".join([str(p.id), str(rec["recommendation_id"])]),
-                        "LOCAL_AUTHORITY": p.data["local-authority"],
-                        **starting_epc_data.to_dict("records")[0],
-                        **ending_epc_data.to_dict("records")[0],
-                        **fixed_data.to_dict("records")[0]
-                    }
-
-                    # We update the description to indicate it's insulated
-                    if rec["type"] == "wall_insulation":
-                        scoring_dict["WALLS_DESCRIPTION_ENDING"] = scoring_map[p.walls["clean_description"]]
-                    elif rec["type"] == "floor_insulation":
-                        scoring_dict["FLOOR_DESCRIPTION_ENDING"] = scoring_map[p.floor["clean_description"]]
-                    else:
-                        raise NotImplementedError("Implement me")
+                    scoring_dict = create_recommendation_scoring_data(
+                        property=p,
+                        recommendation=rec,
+                        starting_epc_data=starting_epc_data,
+                        ending_epc_data=ending_epc_data,
+                        fixed_data=fixed_data,
+                    )

                    recommendations_scoring_data.append(scoring_dict)

+        # cleanup
+        del data_processor
+
        logger.info("Preparing data for scoring in sap change api")
        recommendations_scoring_data = pd.DataFrame(recommendations_scoring_data)

@ -354,61 +375,31 @@ async def trigger_plan(body: PlanTriggerRequest):
        cleaning_data = read_parquet_from_s3(
            bucket_name=get_settings().DATA_BUCKET,
            file_key="sap_change_model/cleaning_dataset.parquet",
-        )
-        cleaning_data = cleaning_data.rename(columns={"local-authority": "LOCAL_AUTHORITY"})
-        # Merge the cleaning data onto recommendations_scoring_data
+        ).rename(columns={"local-authority": "LOCAL_AUTHORITY"})

-        recommendations_scoring_data[["FLOOR_HEIGHT", "TOTAL_FLOOR_AREA"]] = recommendations_scoring_data[
-            ["FLOOR_HEIGHT", "TOTAL_FLOOR_AREA"]
-        ].replace("", None)
+        # Merge the cleaning data onto recommendations_scoring_data

        # Perform the same cleaning as in the model
        recommendations_scoring_data = DataProcessor.apply_averages_cleaning(
            data_to_clean=recommendations_scoring_data,
            cleaning_data=cleaning_data,
            cols_to_merge_on=COLUMNS_TO_MERGE_ON + ["LOCAL_AUTHORITY"]
+        ).drop(columns=["LOCAL_AUTHORITY"])
+
+        sap_change_model_api = SAPChangeModelAPI(portfolio_id=body.portfolio_id, timestamp=created_at)
+        file_location = sap_change_model_api.upload_scoring_data(
+            df=recommendations_scoring_data, bucket=get_settings().DATA_BUCKET
        )
-        recommendations_scoring_data = recommendations_scoring_data.drop(columns=["LOCAL_AUTHORITY"])
-
-        # Note: We might need to perform the full pre-processing here
-        data_processor = DataProcessor(filepath=None)
-        data_processor.insert_data(recommendations_scoring_data)
-        data_processor.remap_columns()
-        recommendations_scoring_data = data_processor.data
-
-        # Remap column types
-        recommendations_scoring_data = recommendations_scoring_data.astype(columntypes)
-        # Store parquet file in s3 for scoring
-        created_at = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
-        file_location = "sap_change_predictions/{portfolio_id}/{timestamp}.parquet".format(
-            portfolio_id=body.portfolio_id,
-            timestamp=created_at
-        )
-
-        logger.info("Storing scoring data to s3")
-        save_dataframe_to_s3_parquet(
-            df=recommendations_scoring_data,
-            bucket_name=get_settings().DATA_BUCKET,
-            file_key=file_location
-        )
-
-        logger.info("Making request to sap change api")
-        sap_change_model_api = SAPChangeModelAPI()
        response = sap_change_model_api.predict(
            file_location="s3://{DATA_BUCKET}/".format(DATA_BUCKET=get_settings().DATA_BUCKET) + file_location,
-            created_at=created_at,
-            portfolio_id=body.portfolio_id
        )

        # Retrieve the predictions
-        predictions = pd.DataFrame(read_csv_from_s3(
-            bucket_name=get_settings().PREDICTIONS_BUCKET,
-            filepath=response["storage_filepath"]
-        ))
+        predictions = pd.DataFrame(
+            read_csv_from_s3(bucket_name=get_settings().PREDICTIONS_BUCKET, filepath=response["storage_filepath"])
+        )

-        # We round the predictions
-        predictions["RDSAP_CHANGE"] = predictions["RDSAP_CHANGE"].astype(float).round(0)
-        # Extract property_id and recommendation_id
+        predictions["RDSAP_CHANGE"] = predictions["RDSAP_CHANGE"].astype(float).round(1)
        predictions[['property_id', 'recommendation_id']] = predictions['id'].str.split('+', expand=True)

        # Insert the predictions into the recommendations and run the optimiser
@ -424,7 +415,7 @@ async def trigger_plan(body: PlanTriggerRequest):
                        rec["recommendation_id"]
                    )]["RDSAP_CHANGE"].values[0]

-                    if not rec["sap_points"]:
+                    if rec["sap_points"] is None:
                        raise ValueError("Sap points missing")

            input_measures = prepare_input_measures(recommendations[property_id], body.goal)
--- a/backend/app/plan/temp_cleaned_data.py
+++ b/backend/app/plan/temp_cleaned_data.py
--- a/backend/ml_models/sap_change_model/api.py
+++ b/backend/ml_models/sap_change_model/api.py
@ -1,32 +1,71 @@
+import pandas as pd
 import requests
 from requests.exceptions import RequestException
 from utils.logger import setup_logger
+from utils.s3 import save_dataframe_to_s3_parquet

 logger = setup_logger()


 class SAPChangeModelAPI:
-    def __init__(self, base_url="https://api.dev.hestia.homes"):
+    def __init__(
+        self,
+        portfolio_id,
+        timestamp,
+        base_url="https://api.dev.hestia.homes",
+    ):
+        """
+        property_id (int, optional): :
+        :param portfolio_id: The portfolio ID to be passed in the request payload. Defaults to 4.
+        :param timestamp: The creation timestamp to be passed in the request payload. Defaults to None.
+        :param base_url:
+        """
        self.base_url = base_url
+        self.portfolio_id = portfolio_id
+        self.timestamp = timestamp

-    def predict(self, file_location, property_id="", portfolio_id=4, created_at=None):
+    def upload_scoring_data(self, df: pd.DataFrame, bucket: str) -> str:
+        """
+        The sap model api needs a scoring data that is sitting in s3 to use as a dataset to score on
+        This method allows the user to upload a table as a parquet file. This method will return the file
+        location, which can be used as the file location in the predict() method
+
+        :param df:  Pandas dataframe with scoring data to be uploaded to s3
+        :param bucket: Name of the bucket in s3 to upload to
+        :return:
+        """
+
+        # Store parquet file in s3 for scoring
+        file_location = "sap_change_predictions/{portfolio_id}/{timestamp}.parquet".format(
+            portfolio_id=self.portfolio_id,
+            timestamp=self.timestamp
+        )
+
+        logger.info("Storing scoring data to s3")
+        save_dataframe_to_s3_parquet(
+            df=df,
+            bucket_name=bucket,
+            file_key=file_location
+        )
+
+        return file_location
+
+    def predict(self, file_location):
        """Makes a POST request to the SAP Change Model API with the provided parameters.

        Args:
            file_location (str): The file location to be passed in the request payload.
-            property_id (int, optional): The property ID to be passed in the request payload. Defaults to 999.
-            portfolio_id (int, optional): The portfolio ID to be passed in the request payload. Defaults to 4.
-            created_at (str, optional): The creation timestamp to be passed in the request payload. Defaults to None.

        Returns:
            dict: The API response as a dictionary if the request was successful, None otherwise.
        """
+        logger.info("Making request to sap change api")
        url = f"{self.base_url}/sapmodel/predict"
        payload = {
            "file_location": f"s3://retrofit-data-dev/{file_location}",
-            "property_id": property_id,
-            "portfolio_id": portfolio_id,
-            "created_at": created_at
+            "property_id": "",  # This should get removed
+            "portfolio_id": self.portfolio_id,
+            "created_at": self.timestamp
        }

        try:
--- a/backend/requirements/base.txt
+++ b/backend/requirements/base.txt
@ -1,3 +1,4 @@
+msgpack==1.0.5
 anyio==3.7.1
 cffi==1.15.1
 click==8.1.3
--- a/backend/tests/test_property.py
+++ b/backend/tests/test_property.py
@ -2,7 +2,7 @@ import pytest
 import pandas as pd
 from unittest.mock import Mock
 from epc_api.client import EpcClient
-from model_data.Property import Property
+from backend.Property import Property
 from open_uprn.OpenUprnClient import OpenUprnClient
 from model_data.EpcClean import EpcClean

@ -19,6 +19,18 @@ mock_epc_response = {
            "hotwater-description": "Hot Water Description",
            "transaction-type": "rental",
            "lighting-description": "Good Lighting Efficiency",
+            "energy-consumption-current": "50",
+            "co2-emissions-current": "123",
+            "mechanical-ventilation": "natural",
+            'photo-supply': 0,
+            "solar-water-heating-flag": "N",
+            "wind-turbine-count": 0,
+            "extension-count": 0,
+            "heat-loss-corridor": "no corridor",
+            "unheated-corridor-length": 0,
+            "mains-gas-flag": "Y",
+            "floor-height": 2.5,
+            "total-floor-area": 100
        },
        {
            "inspection-date": "2023-05-01",
@ -30,6 +42,18 @@ mock_epc_response = {
            "hotwater-description": "Hot Water Description",
            "transaction-type": "rental",
            "lighting-description": "Good Lighting Efficiency",
+            "energy-consumption-current": "50",
+            "co2-emissions-current": "123",
+            "mechanical-ventilation": "natural",
+            'photo-supply': 0,
+            "solar-water-heating-flag": "N",
+            "wind-turbine-count": 0,
+            "extension-count": 0,
+            "heat-loss-corridor": "no corridor",
+            "unheated-corridor-length": 0,
+            "mains-gas-flag": "Y",
+            "floor-height": 2.5,
+            "total-floor-area": 100
        }
    ]
 }
@ -42,6 +66,18 @@ mock_epc_response_dupe = {
            'mainheat-description': 'Main Heating Description', 'hotwater-description': 'Hot Water Description',
            "transaction-type": "rental",
            "lighting-description": "Good Lighting Efficiency",
+            "energy-consumption-current": "50",
+            "co2-emissions-current": "123",
+            "mechanical-ventilation": "natural",
+            'photo-supply': 0,
+            "solar-water-heating-flag": "N",
+            "wind-turbine-count": 0,
+            "extension-count": 0,
+            "heat-loss-corridor": "no corridor",
+            "unheated-corridor-length": 0,
+            "mains-gas-flag": "Y",
+            "floor-height": 2.5,
+            "total-floor-area": 100
        },
        {
            'inspection-date': '2023-05-01', 'some-other-key': 'some-other-value',
@ -50,6 +86,18 @@ mock_epc_response_dupe = {
            'hotwater-description': 'Hot Water Description',
            "transaction-type": "rental",
            "lighting-description": "Good Lighting Efficiency",
+            "energy-consumption-current": "50",
+            "co2-emissions-current": "123",
+            "mechanical-ventilation": "natural",
+            'photo-supply': 0,
+            "solar-water-heating-flag": "N",
+            "wind-turbine-count": 0,
+            "extension-count": 0,
+            "heat-loss-corridor": "no corridor",
+            "unheated-corridor-length": 0,
+            "mains-gas-flag": "Y",
+            "floor-height": 2.5,
+            "total-floor-area": 100
        },
        {
            'inspection-date': '2023-06-01', 'some-other-key': 'duplicate-date',
@ -58,6 +106,18 @@ mock_epc_response_dupe = {
            'mainheat-description': 'Main Heating Description', 'hotwater-description': 'Hot Water Description',
            "transaction-type": "rental",
            "lighting-description": "Good Lighting Efficiency",
+            "energy-consumption-current": "50",
+            "co2-emissions-current": "123",
+            "mechanical-ventilation": "natural",
+            'photo-supply': 0,
+            "solar-water-heating-flag": "N",
+            "wind-turbine-count": 0,
+            "extension-count": 0,
+            "heat-loss-corridor": "no corridor",
+            "unheated-corridor-length": 0,
+            "mains-gas-flag": "Y",
+            "floor-height": 2.5,
+            "total-floor-area": 100
        }
    ]
 }
@ -66,11 +126,11 @@ mock_epc_response_dupe = {
 class TestProperty:
    @pytest.fixture(autouse=True)
    def property_instance(self, mock_epc_client, mock_open_uprn_client, mock_cleaner):
-        return Property("AB12CD", "Test Address", epc_client=mock_epc_client)
+        return Property(1, "AB12CD", "Test Address", epc_client=mock_epc_client)

    @pytest.fixture(autouse=True)
    def property_instance_dupe_data(self, mock_epc_client_dupe_data):
-        return Property("AB12CD", "Test Address", epc_client=mock_epc_client_dupe_data)
+        return Property(2, "AB12CD", "Test Address", epc_client=mock_epc_client_dupe_data)

    @pytest.fixture
    def mock_epc_client(self):
@ -99,15 +159,26 @@ class TestProperty:

    @pytest.fixture
    def mock_cleaner(self):
-        mock_cleaner = Mock(spec=EpcClean(data=[
-            {"roof-description": "Roof Description"},
-            {"walls-description": "Walls Description"},
-            {"windows-description": "Windows Description"},
-            {"mainheat-description": "Main Heating Description"},
-            {"hotwater-description": "Hot Water Description"},
-            {"lighting-description": "Good Lighting Efficiency"},
-            {"low-energy-lighting": 0}
-        ]))
+        lighting_averages = [
+            {'lighting-description': 'good lighting efficiency', 'low-energy-lighting': 99.26666666666667},
+            {'lighting-description': 'excellent lighting efficiency', 'low-energy-lighting': 100.0},
+            {'lighting-description': 'below average lighting efficiency', 'low-energy-lighting': 0.0}
+        ]
+
+        cleaner_spec = EpcClean(
+            data=[
+                {"roof-description": "Roof Description"},
+                {"walls-description": "Walls Description"},
+                {"windows-description": "Windows Description"},
+                {"mainheat-description": "Main Heating Description"},
+                {"hotwater-description": "Hot Water Description"},
+                {"lighting-description": "Good Lighting Efficiency"},
+                {"low-energy-lighting": 0}
+            ],
+            lighting_averages=lighting_averages
+        )
+
+        mock_cleaner = Mock(spec=cleaner_spec)
        mock_cleaner.cleaned = {
            "roof-description": [{"original_description": "Roof Description"}],
            "walls-description": [{"original_description": "Walls Description"}],
@ -119,14 +190,14 @@ class TestProperty:
        return mock_cleaner

    def test_init(self, mock_epc_client):
-        inst1 = Property("AB12CD", "Test Address", epc_client=mock_epc_client)
+        inst1 = Property(0, "AB12CD", "Test Address", epc_client=mock_epc_client)
        # Should be mocked auth token
        assert inst1.epc_client.auth_token == "mocked_auth_token"

-        inst2 = Property("AB12CD", "Test Address")
+        inst2 = Property(3, "AB12CD", "Test Address")
        assert inst2.epc_client.auth_token

-        inst3 = Property("AB12CD", "Test Address", data={"some": "data"})
+        inst3 = Property(4, "AB12CD", "Test Address", data={"some": "data"})
        assert inst3.data == {"some": "data"}

        data = inst3.search_address_epc()
@ -143,29 +214,9 @@ class TestProperty:
        with pytest.raises(Exception, match="More than one result found for this address - investigate me"):
            property_instance_dupe_data.search_address_epc()

-    def test_get_coordinates(self, property_instance, mock_open_uprn_client):
-        # Set up the mock OpenUprnClient
-        property_instance.data = {"uprn": 12345}
-        property_instance.get_coordinates(mock_open_uprn_client)
-
-        # Verify that the coordinates are set correctly
-        assert property_instance.coordinates == {
-            "uprn": 12345,
-            "longitude": 1.2345,
-            "latitude": 2.3456
-        }
-
-    def test_get_coordinates_without_open_uprn_data(self, property_instance, mock_open_uprn_client):
-        # Modify the mock OpenUprnClient to not have read any data
-        mock_open_uprn_client.data = None
-
-        # Verify that ValueError is raised when OpenUprnClient data is None
-        with pytest.raises(ValueError, match="OpenUprnClient has not read data"):
-            property_instance.get_coordinates(mock_open_uprn_client)
-
    def test_get_components(self, property_instance, mock_cleaner, mock_epc_client):
        property_instance.search_address_epc()
-        property_instance.get_components(mock_cleaner)
+        property_instance.get_components(mock_cleaner.cleaned)

        # Verify that the components are set correctly
        assert property_instance.roof == {"original_description": "Roof Description"}
@ -180,7 +231,7 @@ class TestProperty:

        # Verify that ValueError is raised when EpcClean doesn't contain cleaned data
        with pytest.raises(ValueError, match="Cleaner does not contain cleaned data"):
-            property_instance.get_components(mock_cleaner)
+            property_instance.get_components(mock_cleaner.cleaned)

    def test_get_components_no_data(self, property_instance, mock_cleaner):
        # Modify the mock cleaner to have no attributes for a specific description
@ -190,7 +241,7 @@ class TestProperty:

        # Verify that ValueError is raised when no attributes are found
        with pytest.raises(ValueError, match="Property does not contain data"):
-            property_instance.get_components(mock_cleaner)
+            property_instance.get_components(mock_cleaner.cleaned)

    def test_get_components_no_attributes(self, property_instance, mock_cleaner):
        # Modify the mock cleaner to have no attributes for a specific description
@ -201,12 +252,12 @@ class TestProperty:

        # Verify that ValueError is raised when no attributes are found
        with pytest.raises(ValueError, match="Either No attributes or multiple found for roof-description"):
-            property_instance.get_components(mock_cleaner)
+            property_instance.get_components(mock_cleaner.cleaned)

    def test_get_components_multiple_attributes(self, property_instance, mock_cleaner):
        # This shouldn't happen - it would mean a cleaning error
        property_instance.search_address_epc()
-        mock_cleaner.cleaned = {
+        cleaned = {
            "roof-description": [
                {"original_description": "Roof Description"},
                {"original_description": "Roof Description"}
@ -215,4 +266,4 @@ class TestProperty:

        # Verify that ValueError is raised when multiple attributes are found
        with pytest.raises(ValueError, match="Either No attributes or multiple found for roof-description"):
-            property_instance.get_components(mock_cleaner)
+            property_instance.get_components(cleaned)
--- a/model_data/cleaner_app.py
+++ b/model_data/cleaner_app.py
@ -7,7 +7,7 @@ from model_data.EpcClean import EpcClean
 from model_data.analysis.UvalueEstimations import UvalueEstimations
 from model_data.simulation_system.core.Settings import EARLIEST_EPC_DATE
 from pathlib import Path
-from model_data.utils import save_data_to_s3
+from utils.s3 import save_data_to_s3

 LAND_REGISTRY_PATHS = [
    os.path.abspath(os.path.dirname(__file__)) + "/model_data/local_data/pp-monthly-update-new-version.csv",
--- a/model_data/epc_attributes/LightingAttributes.py
+++ b/model_data/epc_attributes/LightingAttributes.py
@ -13,14 +13,28 @@ class LightingAttributes:
    def __init__(self, description, averages):
        self.description: str = clean_description(description.lower())

-        translation = self.WELSH_TEXT.get(self.description)
-        if translation:
-            self.nodata = False
-            self.description = translation
+        self.welsh_translation_search()

        self.description = correct_spelling(self.description)
        self.averages = averages

+    def welsh_translation_search(self):
+        """
+        For welsh text describing the percentage of low energy lighting, we match the regular
+        expression and perform the translation
+        """
+        lel_match = re.search(r"goleuadau ynni-isel mewn (\d+)%? ogçör mannau gosod", self.description)
+
+        if lel_match:
+            # Perform the actual translation
+            percentage = lel_match.group(1)
+            self.description = f"low energy lighting in {percentage}% of fixed outlets"
+        else:
+            translation = self.WELSH_TEXT.get(self.description)
+            if translation:
+                self.nodata = False
+                self.description = translation
+
    def process(self):

        description = self.description
--- a/model_data/simulation_system/core/DataProcessor.py
+++ b/model_data/simulation_system/core/DataProcessor.py
@ -12,6 +12,9 @@ from model_data.simulation_system.core.Settings import (
    FLOOR_LEVEL_MAP,
    BUILT_FORM_REMAP,
    COLUMNS_TO_MERGE_ON,
+    COMPONENT_FEATURES,
+    FIXED_FEATURES,
+    COLUMNTYPES
 )
 from typing import List

@ -21,9 +24,16 @@ class DataProcessor:
    Handle data loading and data preprocessing
    """

-    def __init__(self, filepath: Path | None) -> None:
+    def __init__(self, filepath: Path | None, newdata: bool = False) -> None:
+        """
+        :param filepath: If specified, is the physical location of the data
+        :param newdata: Indicates if we are processing new, testing data.
+                        In this instance, there are some operations we do not
+                        want to perform, such as confine_data()
+        """
        self.filepath = filepath
        self.data = None
+        self.newdata = newdata

    def load_data(self, low_memory=False) -> None:
        if not self.filepath:
@ -140,14 +150,30 @@ class DataProcessor:
                    break
                to_index -= 1

+    def reformat_columns(self):
+        """
+        This function applies the re-formattng of columns from lower case to capitalised
+
+        When requesting the epc data from the api, the columns are lower case
+        and separated by a hyphen, whereas in the bulk download, the columns
+        are capitalised and separated by underscores. If rename_columns is True
+        we convert the columns from lower case to capitalised format
+        :return:
+        """
+        self.data.columns = [col.upper().replace("-", "_") for col in self.data.columns]
+
    def pre_process(self) -> pd.DataFrame:
        """
        Load data and begin initial cleaning
        """
-        if not self.data:
+        if self.data is None:
            self.load_data(low_memory=DATA_PROCESSOR_SETTINGS["low_memory"])

-        self.confine_data()
+        if self.newdata:
+            self.reformat_columns()
+
+        if not self.newdata:
+            self.confine_data()

        # We have some non-standard construction age bands which we'll clean for matching
        self.standardise_construction_age_band()
@ -159,9 +185,10 @@ class DataProcessor:
        self.clean_multi_glaze_proportion()
        self.clean_photo_supply()

-        self.retain_multiple_epc_properties(
-            epc_minimum_count=DATA_PROCESSOR_SETTINGS["epc_minimum_count"]
-        )
+        if not self.newdata:
+            self.retain_multiple_epc_properties(
+                epc_minimum_count=DATA_PROCESSOR_SETTINGS["epc_minimum_count"]
+            )
        self.remap_columns()

        if DATA_PROCESSOR_SETTINGS["epc_minimum_count"] >= 1:
@ -169,6 +196,8 @@ class DataProcessor:
            self.fill_na_fields()

        self.data = self.data.sort_values(["UPRN", "LODGEMENT_DATE"], ascending=True)
+        # Final re-casting after data transformed and prepared
+        self.data = self.data.astype(COLUMNTYPES)

        return self.data

@ -178,6 +207,7 @@ class DataProcessor:
        """
        # Each uprn can fille backward from recent and forward fill from oldest
        # The groupby changes the order and we use the index to make the original data
+
        filled_data = (
            self.data.groupby("UPRN", group_keys=True)[columns_to_fill]
            .apply(lambda group: group.fillna(method="bfill").fillna(method="ffill"))
@ -188,6 +218,11 @@ class DataProcessor:

        self.data[columns_to_fill] = filled_data[columns_to_fill]

+        # For floor area, we also replace "" values with None
+        self.data[["FLOOR_HEIGHT", "TOTAL_FLOOR_AREA"]] = self.data[
+            ["FLOOR_HEIGHT", "TOTAL_FLOOR_AREA"]
+        ].replace("", None)
+
    def remap_columns(self):
        """
        Remap all columns, for any non values
@ -430,3 +465,24 @@ class DataProcessor:
            data_to_clean.drop(columns=[f"{col}_AVERAGE"], inplace=True)

        return data_to_clean
+
+    def get_component_features(self, suffix: str) -> pd.DataFrame:
+        """
+        This function will return the property components such as the walls, roof, heating etc
+        as well as lodgement date. These are features that we expect might change from one EPC to the
+        next
+        :param suffix: Should be one of "_STARTING" or "_ENDING"
+        :return: Pandas dataframe containing the subset of columns defined in COMPONENT_FEATURES
+        """
+
+        if suffix not in ["_STARTING", "_ENDING"]:
+            raise Exception("Suffix should be one of _STARTING or _ENFING")
+
+        return self.data[COMPONENT_FEATURES + ["LODGEMENT_DATE"]].copy().add_suffix(suffix)
+
+    def get_fixed_features(self) -> pd.DataFrame:
+        """
+        Returns the fixed features that we don't believe should vary from one EPC to the next
+        :return: Pandas dataframe containing the columns defined in FIXED_FEATURES
+        """
+        return self.data[FIXED_FEATURES]
--- a/model_data/simulation_system/core/Settings.py
+++ b/model_data/simulation_system/core/Settings.py
@ -134,6 +134,7 @@ EARLIEST_EPC_DATE = "2014-08-01"

 RDSAP_RESPONSE = "CURRENT_ENERGY_EFFICIENCY"
 HEAT_DEMAND_RESPONSE = "ENERGY_CONSUMPTION_CURRENT"
+CARBON_RESPONSE = "CO2_EMISSIONS_CURRENT"


 def ordinal(n):
@ -167,3 +168,29 @@ DATA_PROCESSOR_SETTINGS = {
    "epc_minimum_count": 1,
    "column_mappings": {"UPRN": [int, str]},
 }
+
+# This has a manual mapping of the column types required
+COLUMNTYPES = {
+    'UPRN': 'object', 'TOTAL_FLOOR_AREA': 'float64', 'FLOOR_HEIGHT': 'float64', 'PROPERTY_TYPE': 'object',
+    'BUILT_FORM': 'object', 'CONSTITUENCY': 'object', 'NUMBER_HABITABLE_ROOMS': 'float64',
+    'NUMBER_HEATED_ROOMS': 'float64', 'FIXED_LIGHTING_OUTLETS_COUNT': 'float64', 'FLOOR_LEVEL': 'float64',
+    'CONSTRUCTION_AGE_BAND': 'object',
+    'TRANSACTION_TYPE': 'object',
+    'WALLS_DESCRIPTION': 'object',
+    'FLOOR_DESCRIPTION': 'object',
+    'LIGHTING_DESCRIPTION': 'object',
+    'ROOF_DESCRIPTION': 'object',
+    'MAINHEAT_DESCRIPTION': 'object',
+    'HOTWATER_DESCRIPTION': 'object', 'MAIN_FUEL': 'object',
+    'MECHANICAL_VENTILATION': 'object',
+    'SECONDHEAT_DESCRIPTION': 'object', 'ENERGY_TARIFF': 'object',
+    'SOLAR_WATER_HEATING_FLAG': 'object', 'PHOTO_SUPPLY': 'float64',
+    'WINDOWS_DESCRIPTION': 'object',
+    'GLAZED_TYPE': 'object',
+    'MULTI_GLAZE_PROPORTION': 'float64',
+    'LOW_ENERGY_LIGHTING': 'float64',
+    'NUMBER_OPEN_FIREPLACES': 'float64',
+    'MAINHEATCONT_DESCRIPTION': 'object',
+    'EXTENSION_COUNT': 'float64',
+    'LODGEMENT_DATE': 'object',
+}
--- a/model_data/simulation_system/generate_rdsap_change.py
+++ b/model_data/simulation_system/generate_rdsap_change.py
@ -1,20 +1,161 @@
 import pandas as pd
 from tqdm import tqdm
+import msgpack

 from pathlib import Path
-from simulation_system.core.Settings import (
+from model_data.simulation_system.core.Settings import (
    MANDATORY_FIXED_FEATURES,
    LATEST_FIELD,
    COMPONENT_FEATURES,
    RDSAP_RESPONSE,
    HEAT_DEMAND_RESPONSE,
    COLUMNS_TO_MERGE_ON,
-    EARLIEST_EPC_DATE
+    EARLIEST_EPC_DATE,
+    CARBON_RESPONSE,
 )
-from simulation_system.core.DataProcessor import DataProcessor
-from utils import save_dataframe_to_s3_parquet
+from model_data.simulation_system.core.DataProcessor import DataProcessor
+from utils.s3 import save_dataframe_to_s3_parquet, read_from_s3

-DATA_DIRECTORY = Path(__file__).parent / "simulation_system" / "data" / "all-domestic-certificates"
+DATA_DIRECTORY = Path(__file__).parent / "model_data" / "simulation_system" / "data" / "all-domestic-certificates"
+
+
+def get_cleaned():
+    """
+    This function will retrieve the cleaned dataset from s3 which has the cleaned
+    descriptions for the epc dataset
+
+    This data is stored in MessagePack format and therefore needs to be decoded
+    :return:
+    """
+
+    cleaned = read_from_s3(
+        s3_file_name="cleaned_epc_data/cleaned.bson",
+        bucket_name="retrofit-data-dev"
+    )
+
+    cleaned = msgpack.unpackb(cleaned, raw=False)
+
+    return cleaned
+
+
+def process_and_prune_desriptions(df, cleaned_lookup):
+    """
+    This method will merge on the cleaned lookup table and ensure that the building fabric in the
+    starting and ending EPC is consistent, so ensure that we are performing our modelling on the cleanest
+    possible dataset.
+    :param df:
+    :param cleaned_lookup:
+    :return:
+    """
+
+    # TODO: In a future iteration, we can test using the binary features and the insulation thickness
+    #       estimates, we well as estimated U-values
+
+    cols_to_drop = {
+        "walls": [
+            'original_description', 'thermal_transmittance',
+            'thermal_transmittance_unit', 'is_cavity_wall', 'is_filled_cavity',
+            'is_solid_brick', 'is_system_built', 'is_timber_frame',
+            'is_granite_or_whinstone', 'is_as_built', 'is_cob', 'is_assumed',
+            'is_sandstone_or_limestone', 'insulation_thickness',
+            'external_insulation', 'internal_insulation',
+            'original_description_ENDING',
+            'thermal_transmittance_ENDING', 'thermal_transmittance_unit_ENDING',
+            'is_cavity_wall_ENDING', 'is_filled_cavity_ENDING',
+            'is_solid_brick_ENDING', 'is_system_built_ENDING',
+            'is_timber_frame_ENDING', 'is_granite_or_whinstone_ENDING',
+            'is_as_built_ENDING', 'is_cob_ENDING', 'is_assumed_ENDING',
+            'is_sandstone_or_limestone_ENDING', 'insulation_thickness_ENDING',
+            'external_insulation_ENDING', 'internal_insulation_ENDING',
+        ],
+        "floor": [
+            'original_description', 'thermal_transmittance',
+            'thermal_transmittance_unit', 'is_assumed', 'is_to_unheated_space',
+            'is_to_external_air', 'is_suspended', 'is_solid',
+            'another_property_below', 'insulation_thickness', 'no_data',
+            'original_description_ENDING',
+            'thermal_transmittance_ENDING', 'thermal_transmittance_unit_ENDING',
+            'is_assumed_ENDING', 'is_to_unheated_space_ENDING',
+            'is_to_external_air_ENDING', 'is_suspended_ENDING', 'is_solid_ENDING',
+            'another_property_below_ENDING', 'insulation_thickness_ENDING',
+            'no_data_ENDING',
+        ],
+        "roof": [
+            'original_description', 'clean_description', 'thermal_transmittance',
+            'thermal_transmittance_unit', 'is_pitched', 'is_roof_room', 'is_loft',
+            'is_flat', 'is_thatched', 'is_at_rafters', 'is_assumed',
+            'has_dwelling_above', 'is_valid', 'insulation_thickness',
+            'original_description_ENDING', 'clean_description_ENDING',
+            'thermal_transmittance_ENDING', 'thermal_transmittance_unit_ENDING',
+            'is_pitched_ENDING', 'is_roof_room_ENDING', 'is_loft_ENDING',
+            'is_flat_ENDING', 'is_thatched_ENDING', 'is_at_rafters_ENDING',
+            'is_assumed_ENDING', 'has_dwelling_above_ENDING', 'is_valid_ENDING',
+            'insulation_thickness_ENDING',
+        ]
+
+    }
+
+    for component in ["walls", "floor", "roof"]:
+        component_upper = component.upper()
+
+        df = df.merge(
+            pd.DataFrame(cleaned_lookup[f"{component}-description"]),
+            how="left",
+            left_on=f"{component_upper}_DESCRIPTION_STARTING",
+            right_on="original_description",
+        ).merge(
+            pd.DataFrame(cleaned_lookup[f"{component}-description"]),
+            how="left",
+            left_on=f"{component_upper}_DESCRIPTION_ENDING",
+            right_on="original_description",
+            suffixes=("", "_ENDING")
+        )
+
+        if component == "walls":
+            # We make sure the wall construction hasn't changed
+            df = df[
+                (df["is_cavity_wall"] == df["is_cavity_wall_ENDING"]) &
+                (df["is_solid_brick"] == df["is_solid_brick_ENDING"]) &
+                (df["is_timber_frame"] == df["is_timber_frame_ENDING"]) &
+                (df["is_granite_or_whinstone"] == df["is_granite_or_whinstone_ENDING"]) &
+                (df["is_cob"] == df["is_cob_ENDING"]) &
+                (df["is_sandstone_or_limestone"] == df["is_sandstone_or_limestone_ENDING"])
+                ]
+        elif component == "floor":
+            df = df[
+                (df["is_suspended"] == df["is_suspended_ENDING"]) &
+                (df["is_solid"] == df["is_solid_ENDING"]) &
+                (df["another_property_below"] == df["another_property_below_ENDING"]) &
+                (df["is_to_unheated_space"] == df["is_to_unheated_space_ENDING"])
+                ]
+        else:
+            df = df[
+                (df["is_pitched"] == df["is_pitched_ENDING"]) &
+                (df["is_roof_room"] == df["is_roof_room_ENDING"]) &
+                (df["is_loft"] == df["is_loft_ENDING"]) &
+                (df["is_flat"] == df["is_flat_ENDING"]) &
+                (df["is_thatched"] == df["is_thatched_ENDING"]) &
+                (df["is_at_rafters"] == df["is_at_rafters_ENDING"]) &
+                (df["has_dwelling_above"] == df["has_dwelling_above_ENDING"])
+                ]
+
+        # Drop the binary indicators and replace the original description with the cleaned version
+
+        # Drop original cols
+        original_cols = [
+            f"{component_upper}_DESCRIPTION_STARTING", f"{component_upper}_DESCRIPTION_ENDING"
+        ]
+
+        df = df.drop(
+            columns=cols_to_drop[component] + original_cols
+        ).rename(
+            columns={
+                "clean_description": f"{component_upper}_DESCRIPTION_STARTING",
+                "clean_description_ENDING": f"{component_upper}_DESCRIPTION_ENDING",
+            }
+        )
+
+    return df


 def app():
@ -23,6 +164,8 @@ def app():
    # Data glossary:
    # https://epc.opendatacommunities.org/docs/guidance#glossary

+    cleaned_lookup = get_cleaned()
+
    # List all subdirectories
    directories = [entry for entry in DATA_DIRECTORY.iterdir() if entry.is_dir()]

@ -38,7 +181,7 @@ def app():
    # TODO [x] : Have a look at temporal features
    # TODO [x] : Floor area will impact the EPC so instead of averaging, we should have a starting and ending value.
    # TODO [x]: Same as floor area for floor height
-    # TODO []: If fundamental building fabric changes, we should proabably discard the record
+    # TODO [x]: If fundamental building fabric changes, we should proabably discard the record
    # TODO [x]: Should we prune records that have an exceptionally large amount of time between them?
    #           - leave for now and check performance after temporal features
    # TODO [x]: If we have multiple EPCs lodged on the same day, should we remove them? Could be corrections?
@ -84,7 +227,7 @@ def app():
            # We include the lodgement date here as we probably need to factor time into the
            # model, since EPC standards and rigour have changed over time
            variable_data = modified_property_data[
-                COMPONENT_FEATURES + ["LODGEMENT_DATE", RDSAP_RESPONSE, HEAT_DEMAND_RESPONSE]
+                COMPONENT_FEATURES + ["LODGEMENT_DATE", RDSAP_RESPONSE, HEAT_DEMAND_RESPONSE, CARBON_RESPONSE]
                ]

            # Note: we look at changes between subsequent EPCS, however we could look at other permutations
@ -104,24 +247,29 @@ def app():
                if gets_better:
                    starting_sap = earliest_record[RDSAP_RESPONSE]
                    starting_heat_demand = earliest_record[HEAT_DEMAND_RESPONSE]
+                    starting_carbon = earliest_record[CARBON_RESPONSE]
+
                    rdsap_change = latest_record[RDSAP_RESPONSE] - starting_sap
                    heat_demand_change = latest_record[HEAT_DEMAND_RESPONSE] - starting_heat_demand
-                else:
-                    starting_sap = latest_record[RDSAP_RESPONSE]
-                    starting_heat_demand = latest_record[HEAT_DEMAND_RESPONSE]
-                    rdsap_change = earliest_record[RDSAP_RESPONSE] - starting_sap
-                    heat_demand_change = earliest_record[HEAT_DEMAND_RESPONSE] - starting_heat_demand
+                    carbon_change = latest_record[CARBON_RESPONSE] - starting_carbon

-                if rdsap_change == 0:
-                    continue
-
-                if gets_better:
                    starting_record = earliest_record[COMPONENT_FEATURES + ["LODGEMENT_DATE"]].add_suffix("_STARTING")
                    ending_record = latest_record[COMPONENT_FEATURES + ["LODGEMENT_DATE"]].add_suffix("_ENDING")
                else:
+                    starting_sap = latest_record[RDSAP_RESPONSE]
+                    starting_heat_demand = latest_record[HEAT_DEMAND_RESPONSE]
+                    starting_carbon = latest_record[CARBON_RESPONSE]
+
+                    rdsap_change = earliest_record[RDSAP_RESPONSE] - starting_sap
+                    heat_demand_change = earliest_record[HEAT_DEMAND_RESPONSE] - starting_heat_demand
+                    carbon_change = earliest_record[CARBON_RESPONSE] - starting_carbon
+
                    starting_record = latest_record[COMPONENT_FEATURES + ["LODGEMENT_DATE"]].add_suffix("_STARTING")
                    ending_record = earliest_record[COMPONENT_FEATURES + ["LODGEMENT_DATE"]].add_suffix("_ENDING")

+                if rdsap_change == 0:
+                    continue
+
                features = pd.concat([starting_record, ending_record])

                property_model_data.append(
@ -129,8 +277,10 @@ def app():
                        "UPRN": uprn,
                        "RDSAP_CHANGE": rdsap_change,
                        "HEAT_DEMAND_CHANGE": heat_demand_change,
-                        "STARTING_SAP": starting_sap,
-                        "STARTING_HEAT_DEMAND": starting_heat_demand,
+                        "CARBON_CHANGE": carbon_change,
+                        "SAP_STARTING": starting_sap,
+                        "HEAT_DEMAND_STARTING": starting_heat_demand,
+                        "CARBON_STARTING": starting_carbon,
                        **fixed_data,
                        **features.to_dict(),
                    }
@ -152,6 +302,14 @@ def app():
        #       floors, we may want to use the U-value. We may also want to handle the (assumed) tags
        #       within descriptions

+        # We look for key building fabric features that have changed from one EPC to the next.
+        # if, for example, we see that a home has gone from being a cavity wall to a solid wall, we
+        # remove this record, as it indicates that the quality of the EPC conducted in the first instance
+        # is low
+        # We also replace descriptions with their cleaned variants
+
+        data_by_urpn_df = process_and_prune_desriptions(data_by_urpn_df, cleaned_lookup)
+
        dataset.append(data_by_urpn_df)

        cleaning_averages["LOCAL_AUTHORITY"] = df["LOCAL_AUTHORITY"].values[0]
--- a/model_data/utils.py
+++ b/model_data/utils.py
@ -1,7 +1,3 @@
-import boto3
-from botocore.exceptions import NoCredentialsError, PartialCredentialsError
-import pandas as pd
-from io import BytesIO
 import re
 from textblob import TextBlob

@ -28,65 +24,3 @@ def correct_spelling(text):

    corrected_text = ' '.join(corrected_words)
    return corrected_text
-
-
-def save_dataframe_to_s3_parquet(df, bucket_name, file_key):
-    """
-    Save a pandas DataFrame to S3 as a Parquet file.
-
-    :param df: The pandas DataFrame.
-    :param bucket_name: Name of the S3 bucket.
-    :param file_key: Key of the file (including directory path within the bucket).
-    """
-
-    # Convert the DataFrame to a Parquet format in memory
-    parquet_buffer = BytesIO()
-    df.to_parquet(parquet_buffer)
-
-    # Create the boto3 client
-    client = boto3.client('s3')
-
-    # Upload the Parquet file to S3
-    client.put_object(Bucket=bucket_name, Key=file_key, Body=parquet_buffer.getvalue())
-
-
-def save_data_to_s3(data, bucket_name, s3_file_name):
-    """
-    Save an object to an S3 bucket
-
-    :param data: The data to save
-    :param bucket_name: The name of the S3 bucket
-    :param s3_file_name: The file name to use for the saved data in S3
-    """
-    # Ensure you have AWS credentials set up - either via environment variables, AWS CLI, or IAM roles
-    try:
-        s3 = boto3.client('s3')
-    except NoCredentialsError:
-        print("Credentials not available.")
-        return
-    except PartialCredentialsError:
-        print("Incomplete credentials provided.")
-        return
-
-    try:
-        s3.put_object(Bucket=bucket_name, Key=s3_file_name, Body=data)
-        print(f'Successfully uploaded data to {bucket_name}/{s3_file_name}')
-    except Exception as e:
-        print(f'Failed to upload data to {bucket_name}/{s3_file_name}: {str(e)}')
-
-
-def read_from_s3(bucket_name, s3_file_name):
-    """
-    Read an object from s3. Decoding of the data is left for outside of this function
-
-    :param bucket_name: The name of the S3 bucket
-    :param s3_file_name: The file name to use for the saved data in S3
-    """
-    # Initialize a session using Amazon S3
-    s3 = boto3.resource('s3')
-
-    # Get the MessagePack data from S3
-    obj = s3.Object(bucket_name, s3_file_name)
-    data = obj.get()['Body'].read()
-
-    return data
--- a/pytest.ini
+++ b/pytest.ini
@ -1,4 +1,4 @@
 [pytest]
 pythonpath = .
 addopts = --cov-report term-missing --cov=model_data --cov=recommendations
-testpaths = model_data/tests recommendations/tests
+testpaths = model_data/tests recommendations/tests backend/tests
--- a/recommendations/config.py
+++ b/recommendations/config.py
@ -0,0 +1,8 @@
+# This map defines the upgrades that are possible to be recommended by the recommendation engine
+# For example,
+# TODO: once we use cleaned descriptions, this should be updated using the cleaned descriptions
+UPGRADES_MAP = {
+    'Solid brick, as built, no insulation (assumed)': 'Solid brick, as built, insulated (assumed)',
+    'Suspended, no insulation (assumed)': 'Suspended, insulated (assumed)',
+    'Solid, no insulation (assumed)': 'Solid, insulated (assumed)',
+}
--- a/run_lambda_local.sh
+++ b/run_lambda_local.sh
@ -20,7 +20,7 @@ fi

 # Step 2: Build the Docker image
 echo "Building Docker image..."
-docker build -t $IMAGE_NAME:$TAG -f backend/docker/lambda.Dockerfile .
+docker build --platform linux/amd64 -t $IMAGE_NAME:$TAG -f backend/docker/lambda.Dockerfile .

 # Step 3: Run the Docker image with the emulator, .env file, and AWS credentials
 echo "Starting the Docker container..."
--- a/utils/s3.py
+++ b/utils/s3.py
@ -0,0 +1,65 @@
+import boto3
+from io import BytesIO
+from botocore.exceptions import NoCredentialsError, PartialCredentialsError
+
+
+def read_from_s3(bucket_name, s3_file_name):
+    """
+    Read an object from s3. Decoding of the data is left for outside of this function
+
+    :param bucket_name: The name of the S3 bucket
+    :param s3_file_name: The file name to use for the saved data in S3
+    """
+    # Initialize a session using Amazon S3
+    s3 = boto3.resource('s3')
+
+    # Get the MessagePack data from S3
+    obj = s3.Object(bucket_name, s3_file_name)
+    data = obj.get()['Body'].read()
+
+    return data
+
+
+def save_data_to_s3(data, bucket_name, s3_file_name):
+    """
+    Save an object to an S3 bucket
+
+    :param data: The data to save
+    :param bucket_name: The name of the S3 bucket
+    :param s3_file_name: The file name to use for the saved data in S3
+    """
+    # Ensure you have AWS credentials set up - either via environment variables, AWS CLI, or IAM roles
+    try:
+        s3 = boto3.client('s3')
+    except NoCredentialsError:
+        print("Credentials not available.")
+        return
+    except PartialCredentialsError:
+        print("Incomplete credentials provided.")
+        return
+
+    try:
+        s3.put_object(Bucket=bucket_name, Key=s3_file_name, Body=data)
+        print(f'Successfully uploaded data to {bucket_name}/{s3_file_name}')
+    except Exception as e:
+        print(f'Failed to upload data to {bucket_name}/{s3_file_name}: {str(e)}')
+
+
+def save_dataframe_to_s3_parquet(df, bucket_name, file_key):
+    """
+    Save a pandas DataFrame to S3 as a Parquet file.
+
+    :param df: The pandas DataFrame.
+    :param bucket_name: Name of the S3 bucket.
+    :param file_key: Key of the file (including directory path within the bucket).
+    """
+
+    # Convert the DataFrame to a Parquet format in memory
+    parquet_buffer = BytesIO()
+    df.to_parquet(parquet_buffer)
+
+    # Create the boto3 client
+    client = boto3.client('s3')
+
+    # Upload the Parquet file to S3
+    client.put_object(Bucket=bucket_name, Key=file_key, Body=parquet_buffer.getvalue())