diff --git a/etl/bill_savings/EnergyConsumptionModel.py b/etl/bill_savings/EnergyConsumptionModel.py
index 01dcce7a..8aa0cbf8 100644
--- a/etl/bill_savings/EnergyConsumptionModel.py
+++ b/etl/bill_savings/EnergyConsumptionModel.py
@@ -586,7 +586,7 @@ class EnergyConsumptionModel:
 
     def estimate_new_consumption(self, current_energy_efficiency, target_efficiency, current_consumption):
         """
-        Given then consumption_averages dataset, which is produced as a result of the data_combining.py script,
+        Given then consumption_averages dataset, which is produced as a result of the training_data.py script,
         for the energy kwh models, this function will estimate the new consumption based on the current consumption,
         based on the expected reduction in consumption from the current rating to the target rating.
         :param current_energy_efficiency:
diff --git a/etl/bill_savings/KwhData.py b/etl/bill_savings/KwhData.py
index ad7a375a..3c68f33f 100644
--- a/etl/bill_savings/KwhData.py
+++ b/etl/bill_savings/KwhData.py
@@ -1,5 +1,6 @@
 import re
 import pandas as pd
+import numpy as np
 from datetime import datetime
 from tqdm import tqdm
 from utils.logger import setup_logger
@@ -11,6 +12,23 @@ logger = setup_logger()
 class KwhData:
     COLS_TO_STRINGIFY = ["main-heating-controls", "floor-level"]
 
+    CATEGORICAL_COLUMNS = [
+        "lodgement-year", "lodgement-month", "main-fuel", "mainheat-description", "number-heated-rooms",
+        "number-habitable-rooms", "mainheat-energy-eff", "mainheatcont-description", "property-type",
+        "built-form",
+        "construction-age-band", "secondheat-description", "hotwater-description", "hot-water-energy-eff",
+        "walls-description", "walls-energy-eff", "roof-description", "roof-energy-eff", "floor-description",
+        "county",
+        "windows-description", "windows-energy-eff", "flat-top-storey",
+        "flat-storey-count", "unheated-corridor-length", "solar-water-heating-flag", "mechanical-ventilation",
+        "low-energy-lighting", "environment-impact-current", "energy-tariff", "current-energy-rating"
+    ]
+
+    NUMERICAL_COLUMNS = [
+        'heating-cost-current', 'total-floor-area', 'co2-emissions-current', 'energy-consumption-current',
+        'heating-cost-potential', 'hot-water-cost-current', 'current-energy-efficiency'
+    ]
+
     def __init__(self, bucket):
         self.run_date = datetime.now().strftime("%Y-%m-%d")
         self.bucket = bucket
@@ -18,6 +36,7 @@ class KwhData:
 
         self.consumption_data_filepath = None
         self.consumption_averages_filepath = None
+        self.model_training_data_filepath = None
 
     @staticmethod
     def extract_kwh_value(text: str):
@@ -116,3 +135,84 @@ class KwhData:
         )
 
         self.data = df
+
+    def transform(
+        self, data: pd.DataFrame, cleaned, new=False, save=False
+    ):
+        """
+        Given the input EPCs, this method will transform the data into a format that can be used by the model
+        This method can be used to transform the training data, or new epcs within the backend engine
+        :return:
+        """
+
+        # TODO: New is a temporary parameter, which will transform the epc descriptions to their transformed features
+        #       in anticipation of the new model
+
+        data["lodgement-date"] = pd.to_datetime(data["lodgement-date"])
+        data["lodgement-year"] = data["lodgement-date"].dt.year
+        data["lodgement-month"] = data["lodgement-date"].dt.month
+
+        # For walls, roof, floor description where we have average thermal transmittance, to avoid too many
+        # categories
+        # we group them
+        ranges = {
+            "lessthan 0.1": (0, 0.1),
+            "0.1 - 0.3": (0.1, 0.3),
+            "0.3 - 0.5": (0.3, 0.5),
+            "morethan 0.5": (0.5, 2.5),
+        }
+
+        # Generate the lookup table
+        thermal_transmittance_lookup_table = []
+        for i in range(1, 251):
+            value = i / 100
+            for label, (low, high) in ranges.items():
+                if low < value <= high:
+                    thermal_transmittance_lookup_table.append({"from": value, "to": label})
+                    break
+
+        # Convert to DataFrame for display
+        thermal_transmittance_lookup_table = pd.DataFrame(thermal_transmittance_lookup_table)
+        thermal_transmittance_lookup_table["from"] = thermal_transmittance_lookup_table["from"].astype(str)
+
+        # Apply the lookup table to the data
+        for feature in ["walls-description", "roof-description", "floor-description"]:
+            cleaned_df = pd.DataFrame(cleaned[feature])[["original_description", "thermal_transmittance"]]
+            # Round to 2 decimal places and convert to string
+            cleaned_df["thermal_transmittance"] = cleaned_df["thermal_transmittance"].round(2).astype(str)
+
+            data = data.merge(
+                cleaned_df,
+                how="left",
+                left_on=feature,
+                right_on="original_description",
+            )
+            # We now have the thermal transmittance in the data, which we can use to group with the lookup table
+            data = data.merge(
+                thermal_transmittance_lookup_table,
+                how="left",
+                left_on="thermal_transmittance",
+                right_on="from",
+            )
+            # Where "to" is populated, replace feature with to
+            data[feature] = np.where(
+                ~pd.isnull(data["to"]),
+                data["to"],
+                data[feature]
+            )
+            data = data.drop(columns=["original_description", "thermal_transmittance", "from", "to"])
+
+        data[self.NUMERICAL_COLUMNS] = data[self.NUMERICAL_COLUMNS].apply(pd.to_numeric)
+        data[self.CATEGORICAL_COLUMNS] = data[self.CATEGORICAL_COLUMNS].astype(str)
+
+        # Create new features:
+        data['estimate_annual_kwh'] = data['energy-consumption-current'] * data['total-floor-area']
+
+        if save:
+            self.model_training_data_filepath = f"energy_consumption/{self.run_date}/training_data.parquet"
+            logger.info(f"Storing energy consumption dataset in s3 at {self.consumption_data_filepath}")
+            save_dataframe_to_s3_parquet(
+                bucket_name=self.bucket,
+                file_key=self.model_training_data_filepath,
+                df=data
+            )
diff --git a/etl/bill_savings/data_collection.py b/etl/bill_savings/data_collection.py
index 0341b885..85a403f1 100644
--- a/etl/bill_savings/data_collection.py
+++ b/etl/bill_savings/data_collection.py
@@ -7,7 +7,7 @@ import inspect
 import pandas as pd
 from tqdm import tqdm
 from bs4 import BeautifulSoup
-from etl.epc.settings import EARLIEST_EPC_DATE
+from training_data.epc.settings import EARLIEST_EPC_DATE
 from pathlib import Path
 import numpy as np
 from utils.s3 import save_pickle_to_s3
diff --git a/etl/bill_savings/training.py b/etl/bill_savings/training.py
index df60298b..5d89a79e 100644
--- a/etl/bill_savings/training.py
+++ b/etl/bill_savings/training.py
@@ -1,7 +1,7 @@
 from pprint import pprint
 import msgpack
 from utils.s3 import read_from_s3
-from etl.bill_savings.EnergyConsumptionModel import EnergyConsumptionModel
+from training_data.bill_savings.EnergyConsumptionModel import EnergyConsumptionModel
 
 
 def handler():
diff --git a/etl/bill_savings/data_combining.py b/etl/bill_savings/training_data.py
similarity index 51%
rename from etl/bill_savings/data_combining.py
rename to etl/bill_savings/training_data.py
index 970c92bf..85b53bca 100644
--- a/etl/bill_savings/data_combining.py
+++ b/etl/bill_savings/training_data.py
@@ -1,4 +1,6 @@
+import msgpack
 from etl.bill_savings.KwhData import KwhData
+from utils.s3 import read_from_s3
 
 
 def app():
@@ -8,5 +10,13 @@ def app():
     :return:
     """
 
+    cleaned = read_from_s3(
+        s3_file_name="cleaned_epc_data/cleaned.bson",
+        bucket_name="retrofit-data-dev"
+    )
+
+    cleaned = msgpack.unpackb(cleaned, raw=False)
+
     kwh_data_client = KwhData(bucket="retrofit-datalake-dev")
     kwh_data_client.combine()
+    kwh_data_client.transform(data=kwh_data_client.data, cleaned=cleaned, save=True)