breakout classes

2026-07-27 23:35:01 +00:00 · 2023-12-06 17:08:05 +00:00 · 2023-12-06 17:08:05 +00:00 · 353e8a90db
commit 353e8a90db
parent 05c01c1770
4 changed files with 533 additions and 354 deletions
--- a/etl/epc/Dataset.py
+++ b/etl/epc/Dataset.py
@ -0,0 +1,107 @@
+import pandas as pd
+from typing import List
+from etl.epc.EPCRecord import EPCDifferenceRecord
+
+class TrainingDataset:
+    """
+    A collection of EPCDifferenceRecords can be combined into a TrainingDataset.
+
+    Each record's ``difference_record`` dict becomes one row of ``self.df``. During
+    construction the lodgement dates are turned into day offsets, the raw date columns
+    are dropped and missing efficiency ratings are filled with "NO_RATING".
+    """
+
+    def __init__(self, datasets: List[EPCDifferenceRecord]) -> None:
+        """
+        :param datasets: the records to combine; each must expose ``difference_record``
+        """
+        self.datasets = datasets
+        self.df = pd.DataFrame([dataset.difference_record for dataset in datasets])
+
+        # Order matters: the day-offset features read the date columns that
+        # _drop_features removes afterwards.
+        self._feature_generation()
+        self._drop_features()
+        self._clean_dataframe()
+        self._clean_efficiency_variables(self.df)
+
+    def _drop_features(self):
+        """
+        Drop features that are not needed for modelling
+        """
+        self.df = self.df.drop(columns=["LODGEMENT_DATE_STARTING", "LODGEMENT_DATE_ENDING"])
+
+    def _feature_generation(self):
+        """
+        Generate features for modelling
+        """
+        self.df["DAYS_TO_STARTING"] = self._calculate_days_to(self.df["LODGEMENT_DATE_STARTING"])
+        self.df["DAYS_TO_ENDING"] = self._calculate_days_to(self.df["LODGEMENT_DATE_ENDING"])
+
+    def _clean_dataframe(self):
+        """
+        Hook for dataframe-level cleaning.
+
+        __init__ calls this method, but the class did not define it, so building a
+        TrainingDataset always raised AttributeError. It is deliberately a no-op for now.
+        TODO: implement the intended cleaning rules, or remove the call from __init__.
+        """
+
+    @staticmethod
+    def _clean_efficiency_variables(df):
+        """
+        Fill missing efficiency ratings with the placeholder "NO_RATING".
+
+        There is scope to clean this by the mode per corresponding description.
+        E.g. for WALLS_ENG_EFF we could look at the mode efficiency rating by description and
+        fill in the missing values with this.
+        When looking at this initially, there are a large volume of records with missing energy efficiency
+        values and therefore a simpler approach was taken just to test including these variables
+
+        :param df: dataframe to clean; it is modified in place
+        :return: the same dataframe object
+        :raises ValueError: if a column whose name does not contain "ENERGY_EFF" has missing values
+        """
+
+        missings = pd.isnull(df).sum()
+        missings = missings[missings >= 1]
+
+        if missings.empty:
+            return df
+
+        # Make sure they are all efficiency columns
+        if not missings.index.str.contains("ENERGY_EFF").all():
+            raise ValueError("Non efficiency columns are missing")
+
+        for column in missings.index:
+            df[column] = df[column].fillna("NO_RATING")
+
+        return df
+
+    @staticmethod
+    def _calculate_days_to(lodgement_date):
+        """
+        Return the number of whole days between EARLIEST_EPC_DATE and the lodgement date(s).
+
+        :param lodgement_date: a single date (e.g. a string) or a pandas Series of dates
+        :return: an int for a single date, otherwise the element-wise day counts
+        """
+        # NOTE(review): EARLIEST_EPC_DATE is not imported by the import block of this
+        # file - confirm where it is defined and import it, otherwise this raises NameError.
+        delta = pd.to_datetime(lodgement_date) - pd.to_datetime(EARLIEST_EPC_DATE)
+
+        # A Series exposes its timedelta components through the .dt accessor; a scalar
+        # Timedelta (or a TimedeltaIndex) exposes .days directly.
+        if isinstance(delta, pd.Series):
+            return delta.dt.days
+
+        return delta.days
+
+    def __add__(self, other) -> "TrainingDataset":
+        """
+        Combine two training datasets into a new one built from both record lists.
+
+        :raises TypeError: if ``other`` is not a TrainingDataset
+        """
+        if not isinstance(other, TrainingDataset):
+            raise TypeError("Addition can only be performed with another instance of TrainingDataset")
+        return TrainingDataset(self.datasets + other.datasets)
+
+    def __radd__(self, other):
+        """
+        Required for sum() to work: sum() starts from the int 0, so an int on the
+        left-hand side is treated as the empty dataset.
+        """
+        if isinstance(other, int):
+            return self
+        else:
+            return self.__add__(other)
+
+class ScoringDataset:
+    """
+    Container for the EPCDifferenceRecords that make up a scoring dataset.
+    """
+
+    def __init__(self, datasets: List[EPCDifferenceRecord]) -> None:
+        self.datasets = datasets
+
+    def __add__(self, other) -> "ScoringDataset":
+        """
+        Return a new ScoringDataset holding the records of both operands.
+        Raises TypeError when ``other`` is not a ScoringDataset.
+        """
+        if isinstance(other, ScoringDataset):
+            return ScoringDataset(self.datasets + other.datasets)
+        raise TypeError("Addition can only be performed with another instance of ScoringDataset")
+
+    def __radd__(self, other):
+        """
+        Reflected addition, needed so that sum() (which starts from the int 0) works
+        """
+        return self if isinstance(other, int) else self.__add__(other)
--- a/etl/epc/EPCRecord.py
+++ b/etl/epc/EPCRecord.py
@ -0,0 +1,290 @@
+
+from dataclasses import dataclass
+from etl.epc.ValidationConfiguration import (
+    EPCRecordValidationConfiguration, 
+    EPCDifferenceRecordValidationConfiguration, 
+    EPCDifferenceRecordFixedDataValidationConfiguration
+)
+from typing import Union, List
+from etl.epc.settings import (
+    RDSAP_RESPONSE,
+    HEAT_DEMAND_RESPONSE,
+    CARBON_RESPONSE,
+    COMPONENT_FEATURES,
+    EFFICIENCY_FEATURES
+)
+
+@dataclass
+class EPCRecord:
+    """
+    Base class for a EPC record
+
+    Records are ordered by the field named by RDSAP_RESPONSE, and subtracting one
+    record from another produces an EPCDifferenceRecord.
+    """
+    UPRN: str
+    WALLS_DESCRIPTION: str
+    FLOOR_DESCRIPTION: str
+    LIGHTING_DESCRIPTION: str
+    ROOF_DESCRIPTION: str
+    MAINHEAT_DESCRIPTION: str
+    HOTWATER_DESCRIPTION: str
+    MAIN_FUEL: str
+    MECHANICAL_VENTILATION: str
+    SECONDHEAT_DESCRIPTION: str
+    WINDOWS_DESCRIPTION: str
+    GLAZED_TYPE: str
+    MULTI_GLAZE_PROPORTION: float
+    LOW_ENERGY_LIGHTING: float
+    NUMBER_OPEN_FIREPLACES: float
+    MAINHEATCONT_DESCRIPTION: str
+    SOLAR_WATER_HEATING_FLAG: str
+    PHOTO_SUPPLY: float
+    TRANSACTION_TYPE: str
+    ENERGY_TARIFF: str
+    EXTENSION_COUNT: float
+    TOTAL_FLOOR_AREA: float
+    FLOOR_HEIGHT: float
+    HOT_WATER_ENERGY_EFF: str
+    FLOOR_ENERGY_EFF: str
+    WINDOWS_ENERGY_EFF: str
+    WALLS_ENERGY_EFF: str
+    SHEATING_ENERGY_EFF: str
+    ROOF_ENERGY_EFF: str
+    MAINHEAT_ENERGY_EFF: str
+    MAINHEATC_ENERGY_EFF: str
+    LIGHTING_ENERGY_EFF: str
+    POTENTIAL_ENERGY_EFFICIENCY: float
+    ENVIRONMENT_IMPACT_POTENTIAL: float
+    ENERGY_CONSUMPTION_POTENTIAL: float
+    CO2_EMISSIONS_POTENTIAL: float
+    LODGEMENT_DATE: str
+    CURRENT_ENERGY_EFFICIENCY: int
+    ENERGY_CONSUMPTION_CURRENT: int
+    CO2_EMISSIONS_CURRENT: float
+
+    def __post_init__(self):
+        # We can have validation and cleaning steps for each of the fields
+        # self.WALLS_DESCRIPTION = 'check'
+        # Could also have cleaning of records if needed
+        self.validation_configuration = EPCRecordValidationConfiguration
+
+        # Validation is currently switched off
+        # self._field_validation()
+
+    def _field_validation(self):
+        """
+        This method will validate each of the fields in the EPC record
+
+        :raises ValueError: if a field fails validation or its configured type is not supported
+        """
+
+        for record_key, validation_config in self.validation_configuration.items():
+            # Get the variable named record key from self
+            field_value = self.__dict__[record_key]
+            field_type = validation_config['type']
+
+            if field_type == "string":
+                self._validate_string(record_key, field_value, validation_config)
+            elif field_type == "float":
+                # record_key was previously not passed here, which raised TypeError
+                self._validate_float(record_key, field_value, validation_config)
+            else:
+                raise ValueError(f"Validation type {validation_config['type']} not supported")
+
+    def _validate_string(self, record_key: str, field_value: Union[str, float], validation_config: dict):
+        """
+        Validate a string field
+
+        :param record_key: name of the field, used in error messages
+        :param field_value: value to validate
+        :param validation_config: may contain 'function' (a callable that raises on bad
+            input) and 'acceptable_values' (a collection of allowed values, or None)
+        :raises ValueError: if the value is not a string or fails any configured check
+        """
+        if not isinstance(field_value, str):
+            raise ValueError(f"Field {record_key} has value {field_value} which is not a string")
+
+        if 'function' in validation_config:
+            try:
+                validation_config['function'](field_value)
+            except Exception as err:
+                raise ValueError(f"Field {record_key} has value {field_value} which does not pass the validation function {validation_config['function']}") from err
+
+        # A missing 'acceptable_values' entry is treated like None: no restriction
+        acceptable_values = validation_config.get('acceptable_values')
+        if acceptable_values is not None:
+            if field_value not in acceptable_values:
+                raise ValueError(f"Field {record_key} has value {field_value} which is not in the acceptable values of {validation_config['acceptable_values']}")
+
+    def _validate_float(self, record_key: str, field_value: Union[str, float], validation_config: dict):
+        """
+        Validate a float field
+
+        :param record_key: name of the field, used in error messages
+        :param field_value: value to validate
+        :param validation_config: may contain 'function' (a callable that raises on bad
+            input) and 'range' (a [minimum, maximum] pair, inclusive, or None)
+        :raises ValueError: if the value is not a float or fails any configured check
+        """
+        if not isinstance(field_value, float):
+            raise ValueError(f"Field {record_key} has value {field_value} which is not a float")
+
+        if 'function' in validation_config:
+            try:
+                validation_config['function'](field_value)
+            except Exception as err:
+                raise ValueError(f"Field {record_key} has value {field_value} which does not pass the validation function {validation_config['function']}") from err
+
+        # A missing 'range' entry is treated like None: no restriction
+        value_range = validation_config.get('range')
+        if value_range is not None:
+            if field_value < value_range[0] or field_value > value_range[1]:
+                raise ValueError(f"Field {record_key} has value {field_value} which is not in the acceptable range of {validation_config['range']}")
+
+    def __sub__(self, other):
+        """
+        This method will return the difference between two EPC records
+
+        :raises ValueError: if ``other`` is not an EPCRecord
+        """
+        if not isinstance(other, EPCRecord):
+            raise ValueError("Can only subtract EPCRecord from EPCRecord")
+
+        difference_record = EPCDifferenceRecord(record1=self, record2=other, auto_sort=True)
+
+        return difference_record
+
+    def _rdsap_scores(self, other):
+        """
+        Return the RDSAP_RESPONSE values of this record and ``other`` for comparison
+
+        :raises ValueError: if ``other`` is not an EPCRecord
+        """
+        if not isinstance(other, EPCRecord):
+            raise ValueError("Can only compare EPCRecord to EPCRecord")
+
+        return self.__dict__[RDSAP_RESPONSE], other.__dict__[RDSAP_RESPONSE]
+
+    def __gt__(self, other):
+        """
+        This method will return True if the EPC record is greater than the other
+        """
+        own_score, other_score = self._rdsap_scores(other)
+        return own_score > other_score
+
+    def __ge__(self, other):
+        """
+        This method will return True if the EPC record is greater than or equal to the other
+        """
+        own_score, other_score = self._rdsap_scores(other)
+        return own_score >= other_score
+
+    def __lt__(self, other):
+        """
+        This method will return True if the EPC record is less than the other
+        """
+        own_score, other_score = self._rdsap_scores(other)
+        return own_score < other_score
+
+    def __le__(self, other):
+        """
+        This method will return True if the EPC record is less than or equal to the other
+        """
+        own_score, other_score = self._rdsap_scores(other)
+        return own_score <= other_score
+
+    def get(self, key: Union[str, List[str]], return_asdict: bool = False, key_suffix: str = None):
+        """
+        This method will return the value of the key
+
+        :param key: a field name or a list of field names; unknown names yield None
+        :param return_asdict: if True, return a dict of name -> value instead of bare values
+        :param key_suffix: appended to every dict key; only used when return_asdict is True
+        :return: a single value, a list of values, or a dict, depending on the arguments
+        """
+        values = self.__dict__
+
+        if return_asdict:
+            # A single name is wrapped so that we do not iterate over its characters
+            keys = [key] if isinstance(key, str) else key
+            output_dict = {x: values.get(x) for x in keys}
+            if key_suffix is not None:
+                output_dict = {f"{x}{key_suffix}": y for x, y in output_dict.items()}
+            return output_dict
+
+        if isinstance(key, list):
+            return [values.get(x) for x in key]
+        elif isinstance(key, str):
+            return values.get(key)
+
+        return None
+
+class EPCDifferenceRecord:
+    """
+    Base class for the difference between two EPC records
+    """
+
+    def __init__(self, record1: EPCRecord, record2: EPCRecord, auto_sort: bool = False):
+        """
+        Initialise the EPCDifferenceRecord and build its difference_record dict.
+        The default usage is for record2 to have the higher RDSAP score; with
+        auto_sort=True the two records are swapped whenever record2 <= record1.
+        """
+        self.record1 = record1
+        self.record2 = record2
+        self.difference_record = {}
+
+        self.difference_validation_configuration = EPCDifferenceRecordValidationConfiguration
+        self.fixed_data_validation_configuration = EPCDifferenceRecordFixedDataValidationConfiguration
+
+        if auto_sort:
+            if self.record2 <= self.record1:
+                self.record1, self.record2 = self.record2, self.record1
+
+        self._construct_difference_record()
+        self._validate_difference_record()
+
+    def _construct_difference_record(self):
+        """
+        Build the difference record: the change in each response between record1
+        (starting) and record2 (ending), the starting and ending response values, the
+        larger of each potential value, and the suffixed component fields of both records
+        """
+        starting, ending = self.record1, self.record2
+
+        rdsap_change = ending.get(RDSAP_RESPONSE) - starting.get(RDSAP_RESPONSE)
+        heat_demand_change = ending.get(HEAT_DEMAND_RESPONSE) - starting.get(HEAT_DEMAND_RESPONSE)
+        carbon_change = ending.get(CARBON_RESPONSE) - starting.get(CARBON_RESPONSE)
+
+        tracked_fields = COMPONENT_FEATURES + EFFICIENCY_FEATURES + ["LODGEMENT_DATE"]
+        ending_record = ending.get(tracked_fields, return_asdict=True, key_suffix="_ENDING")
+        starting_record = starting.get(tracked_fields, return_asdict=True, key_suffix="_STARTING")
+
+        combined = {
+            "UPRN": starting.get("UPRN"),
+            "RDSAP_CHANGE": rdsap_change,
+            "HEAT_DEMAND_CHANGE": heat_demand_change,
+            "CARBON_CHANGE": carbon_change,
+            "SAP_STARTING": starting.get(RDSAP_RESPONSE),
+            "SAP_ENDING": ending.get(RDSAP_RESPONSE),
+            "HEAT_DEMAND_STARTING": starting.get(HEAT_DEMAND_RESPONSE),
+            "HEAT_DEMAND_ENDING": ending.get(HEAT_DEMAND_RESPONSE),
+            "CARBON_STARTING": starting.get(CARBON_RESPONSE),
+            "CARBON_ENDING": ending.get(CARBON_RESPONSE),
+        }
+
+        # TODO: DO we want to take the earliest potentials or max potentials?
+        potential_fields = (
+            "POTENTIAL_ENERGY_EFFICIENCY",
+            "ENVIRONMENT_IMPACT_POTENTIAL",
+            "ENERGY_CONSUMPTION_POTENTIAL",
+            "CO2_EMISSIONS_POTENTIAL",
+        )
+        for potential in potential_fields:
+            combined[potential] = max(starting.get(potential), ending.get(potential))
+
+        # Ending fields first, then starting fields, as in the original key order
+        combined.update(ending_record)
+        combined.update(starting_record)
+
+        self.difference_record = combined
+
+    def _validate_difference_record(self):
+        """
+        Validate the difference record (currently a no-op)
+        """
+        # Disabled sketch of a negative-value check, kept for reference:
+        # for key, value in self.difference_record.items():
+        #     if key == "LODGEMENT_DATE":
+        #         continue
+        #     if isinstance(value, str):
+        #         continue
+        #     if value < 0:
+        #         raise ValueError(f"Difference record has negative value for {key}")
+        return None
+
+    def compare_fields_in_records(self, fields: List[str]):
+        """
+        Return True when both records hold equal values for every one of the
+        given fields, and False as soon as one field differs
+        """
+        return not any(self.record1.get(field) != self.record2.get(field) for field in fields)
+
+    def get(self, key: str):
+        """
+        Look up a key in the difference record; None when the key is absent
+        """
+        return self.difference_record.get(key)
+
+    def append_fixed_data(self, fixed_data: dict):
+        """
+        Validate the fixed data and merge it into the difference record
+        """
+        self._validate_fixed_data(fixed_data)
+        self.difference_record.update(fixed_data)
+
+    def _validate_fixed_data(self, fixed_data: dict):
+        """
+        Validate the fixed data (currently a no-op)
+        """
+
+        # Can have more sophisticated checks here, driven by
+        # self.fixed_data_validation_configuration
+
+        return None
--- a/etl/epc/ValidationConfiguration.py
+++ b/etl/epc/ValidationConfiguration.py
@ -21,3 +21,37 @@ EPCRecordValidationConfiguration = {
        "range": [0, 100]
    }
 }
+
+# Validation rules for the computed fields of an EPCDifferenceRecord.
+# Currently empty: no rules are defined yet.
+EPCDifferenceRecordValidationConfiguration = {
+}
+
+# Validation rules for the fixed data appended to an EPCDifferenceRecord.
+# Each entry maps a field name to its expected "type" plus either the list of
+# "acceptable_values" or the [minimum, maximum] "range".
+# NOTE(review): the "integer" type used below is not one of the types that
+# EPCRecord._field_validation dispatches on ("string" and "float") - confirm the
+# fixed-data validator will support it.
+EPCDifferenceRecordFixedDataValidationConfiguration = {
+    "PROPERTY_TYPE": {
+        "type": "string",
+        "acceptable_values": ["House", "Flat", "Bungalow", "Maisonette", "Park home", "Other"]
+    },
+    "BUILT_FORM": {
+        "type": "string",
+        "acceptable_values": ["Detached", "Semi-Detached", "End-Terrace", "Mid-Terrace", "Enclosed Mid-Terrace", "Enclosed End-Terrace", "Enclosed Detached", "Not applicable"]
+    },
+    # NOTE(review): key is spelled "CONSITUENCY" - presumably "CONSTITUENCY" was meant;
+    # verify against the column name in the source data before renaming.
+    "CONSITUENCY": {
+        "type": "string",
+        "acceptable_values": ["England", "Wales", "Scotland", "Northern Ireland"]
+    },
+    "NUMBER_HABITABLE_ROOMS": {
+        "type": "integer",
+        "range": [0, 100]
+    },
+    "NUMBER_HEATED_ROOMS": {
+        "type": "integer",
+        "range": [0, 100]
+    },
+    "FIXED_LIGHTING_OUTLETS_COUNT": {
+        "type": "integer",
+        "range": [0, 100]
+    },
+    # NOTE(review): an empty list (rather than None) would reject every value under a
+    # membership check like the one in EPCRecord._validate_string - confirm the intent.
+    "CONSTRUCTION_AGE_BAND": {
+        "type": "string",
+        "acceptable_values": []
+    }
+}
--- a/etl/epc/property_change_app.py
+++ b/etl/epc/property_change_app.py
@ -18,6 +18,7 @@ from etl.epc.settings import (
    MINIMUM_FLOOR_HEIGHT
 )
 from etl.epc.DataProcessor import DataProcessor
+from etl.epc.EPCRecord import EPCRecord, EPCDifferenceRecord
 from utils.s3 import save_dataframe_to_s3_parquet, read_from_s3
 from recommendations.rdsap_tables import england_wales_age_band_lookup
 from recommendations.recommendation_utils import (
@ -223,6 +224,7 @@ def make_uvalues(df):
    df["row_index"] = df.index

    uvalues = []
+    # TODO: iterrows is the slowest way to do this, we should use a vectorised approach or itertuples
    for _, x in df.iterrows():

        uprn = x["UPRN"]
@ -379,285 +381,23 @@ def make_uvalues(df):
    return df


-def compare_records(earliest_record: pd.Series, latest_record: pd.Series, columns: list):
-    """
-    For a list of columns, check if the earliest and latest record are the same
-    If they are the same, we indicate this, because we have example of SAP scores changing
-    without any feature changes
-    :param earliest_record: pd.Series
-    :param latest_record: pd.Series
-    :param columns: list of columns to compare
-    :return: boolean indicating whether or not all features are the same
-    """
+# def compare_records(earliest_record: pd.Series, latest_record: pd.Series, columns: list):
+#     """
+#     For a list of columns, check if the earliest and latest record are the same
+#     If they are the same, we indicate this, because we have example of SAP scores changing
+#     without any feature changes
+#     :param earliest_record: pd.Series
+#     :param latest_record: pd.Series
+#     :param columns: list of columns to compare
+#     :return: boolean indicating whether or not all features are the same
+#     """

-    all_equal = True
-    for col in columns:
-        if earliest_record[col] != latest_record[col]:
-            return False
-    if all_equal:
-        return True
-
-from dataclasses import dataclass
-from etl.epc.ValidationConfiguration import EPCRecordValidationConfiguration
-from typing import Union, List
-
-@dataclass
-class EPCRecord:
-    """
-    Base class for a EPC record
-    """
-    UPRN: str
-    WALLS_DESCRIPTION: str
-    FLOOR_DESCRIPTION: str
-    LIGHTING_DESCRIPTION: str
-    ROOF_DESCRIPTION: str
-    MAINHEAT_DESCRIPTION: str
-    HOTWATER_DESCRIPTION: str
-    MAIN_FUEL: str
-    MECHANICAL_VENTILATION: str
-    SECONDHEAT_DESCRIPTION: str
-    WINDOWS_DESCRIPTION: str
-    GLAZED_TYPE: str
-    MULTI_GLAZE_PROPORTION: float
-    LOW_ENERGY_LIGHTING: float
-    NUMBER_OPEN_FIREPLACES: float
-    MAINHEATCONT_DESCRIPTION: str
-    SOLAR_WATER_HEATING_FLAG: str
-    PHOTO_SUPPLY: float
-    TRANSACTION_TYPE: str
-    ENERGY_TARIFF: str
-    EXTENSION_COUNT: float
-    TOTAL_FLOOR_AREA: float
-    FLOOR_HEIGHT: float
-    HOT_WATER_ENERGY_EFF: str
-    FLOOR_ENERGY_EFF: str
-    WINDOWS_ENERGY_EFF: str 
-    WALLS_ENERGY_EFF: str
-    SHEATING_ENERGY_EFF: str
-    ROOF_ENERGY_EFF: str
-    MAINHEAT_ENERGY_EFF: str
-    MAINHEATC_ENERGY_EFF: str 
-    LIGHTING_ENERGY_EFF: str
-    POTENTIAL_ENERGY_EFFICIENCY: float
-    ENVIRONMENT_IMPACT_POTENTIAL: float
-    ENERGY_CONSUMPTION_POTENTIAL: float 
-    CO2_EMISSIONS_POTENTIAL: float
-    LODGEMENT_DATE: str
-    CURRENT_ENERGY_EFFICIENCY: int
-    ENERGY_CONSUMPTION_CURRENT: int
-    CO2_EMISSIONS_CURRENT: float
-
-    def __post_init__(self):
-        # We can have validation and cleaning steps for each of the fields
-        # self.WALLS_DESCRIPTION = 'check'
-        # Could also have cleaning of records if needed
-        # self._field_validation()
-        pass
-
-    def _field_validation(self):
-        """
-        This method will validate each of the fields in the EPC record
-        """
-        self.validation_configuration = EPCRecordValidationConfiguration
-        
-        for record_key, validation_config in self.validation_configuration.items():
-            # Get the variable named record key from self
-            field_value = self.__dict__[record_key]
-
-            if validation_config['type'] == "string":
-                self._validate_string(record_key, field_value, validation_config)
-            elif validation_config['type'] == "float":
-                self._validate_float(field_value, validation_config)
-            else:
-                raise ValueError(f"Validation type {validation_config['type']} not supported")
-
-    def _validate_string(self, record_key: str, field_value: Union[str, float], validation_config: dict):
-        """
-        Validate a string field
-        """
-        if not isinstance(field_value, str):
-            raise ValueError(f"Field {record_key} has value {field_value} which is not a string")
-        
-        if 'function' in validation_config:
-            try:
-                validation_config['function'](field_value)
-            except:
-                raise ValueError(f"Field {record_key} has value {field_value} which does not pass the validation function {validation_config['function']}")
-
-        if validation_config['acceptable_values'] is not None:
-            if field_value not in validation_config['acceptable_values']:
-                raise ValueError(f"Field {record_key} has value {field_value} which is not in the acceptable values of {validation_config['acceptable_values']}")
-    
-    def _validate_float(self, record_key: str, field_value: Union[str, float], validation_config: dict):
-        """
-        Validate a float field
-        """
-        if not isinstance(field_value, float):
-            raise ValueError(f"Field {record_key} has value {field_value} which is not a float")
-        
-        if 'function' in validation_config:
-            try:
-                validation_config['function'](field_value)
-            except:
-                raise ValueError(f"Field {record_key} has value {field_value} which does not pass the validation function {validation_config['function']}")
-        
-        if validation_config['range'] is not None:
-            if field_value < validation_config['range'][0] or field_value > validation_config['range'][1]:
-                raise ValueError(f"Field {record_key} has value {field_value} which is not in the acceptable range of {validation_config['range']}")    
-    
-    def __sub__(self, other):
-        """
-        This method will return the difference between two EPC records
-        """
-        if not isinstance(other, EPCRecord):
-            raise ValueError("Can only subtract EPCRecord from EPCRecord")
-        
-        difference_record = EPCDifferenceRecord(record1=self, record2=other, auto_sort=True)
-        
-        return difference_record
-    
-    def __gt__(self, other):
-        """
-        This method will return True if the EPC record is greater than or equal to the other
-        """
-        if not isinstance(other, EPCRecord):
-            raise ValueError("Can only compare EPCRecord to EPCRecord")
-        
-        return self.__dict__[RDSAP_RESPONSE] > other.__dict__[RDSAP_RESPONSE]
-    
-    def __ge__(self, other):
-        """
-        This method will return True if the EPC record is greater than or equal to the other
-        """
-        if not isinstance(other, EPCRecord):
-            raise ValueError("Can only compare EPCRecord to EPCRecord")
-        
-        return self.__dict__[RDSAP_RESPONSE] >= other.__dict__[RDSAP_RESPONSE]
-        
-    def __lt__(self, other):
-        """
-        This method will return True if the EPC record is greater than or equal to the other
-        """
-        if not isinstance(other, EPCRecord):
-            raise ValueError("Can only compare EPCRecord to EPCRecord")
-        
-        return self.__dict__[RDSAP_RESPONSE] < other.__dict__[RDSAP_RESPONSE]
-    
-    def __le__(self, other):
-        """
-        This method will return True if the EPC record is greater than or equal to the other
-        """
-        if not isinstance(other, EPCRecord):
-            raise ValueError("Can only compare EPCRecord to EPCRecord")
-        
-        return self.__dict__[RDSAP_RESPONSE] <= other.__dict__[RDSAP_RESPONSE]
-        
-    def get(self, key: Union[str, List[str]], return_asdict: bool = False, key_suffix: str = None):
-        """
-        This method will return the value of the key
-        """
-        if return_asdict:
-            output_dict = {x: self.__dict__[x] if x in self.__dict__.keys() else None for x in key}
-            if key_suffix is not None:
-                output_dict = {f"{x}_{key_suffix}": y for x, y in output_dict.items()}
-            return output_dict
-
-        if isinstance(key, list):
-            return [self.__dict__[x] if x in self.__dict__.keys() else None for x in key]
-        elif isinstance(key, str):
-            return self.__dict__[key] if key in self.__dict__.keys() else None    
-        
-
-class EPCDifferenceRecord:
-    """
-    Base class for the difference between two EPC records
-    """
-
-    def __init__(self, record1: EPCRecord, record2: EPCRecord, auto_sort: bool = False):
-        """
-        This method will initialise the EPCDifferenceRecord
-        Defaults usage is with record2 to have the higher RDSAP score
-        """
-        self.record1 = record1
-        self.record2 = record2
-        self.difference_record = {}
-
-        if auto_sort and (self.record2 <= self.record1):
-            self.record1, self.record2 = self.record2, self.record1
-
-        self._construct_difference_record()
-        self._validate_difference_record()
-
-    def _construct_difference_record(self):
-        """
-        This method will construct the difference record between the two records
-        """
-
-        rdsap_change = self.record2.get(RDSAP_RESPONSE) - self.record1.get(RDSAP_RESPONSE)
-        heat_demand_change = self.record2.get(HEAT_DEMAND_RESPONSE) - self.record1.get(HEAT_DEMAND_RESPONSE)
-        carbon_change = self.record2.get(CARBON_RESPONSE) - self.record1.get(CARBON_RESPONSE)
-
-        component_variables = COMPONENT_FEATURES + EFFICIENCY_FEATURES
-        ending_record = self.record2.get(component_variables + ["LODGEMENT_DATE"], return_asdict=True, key_suffix="_ENDING")
-        starting_record = self.record1.get(component_variables + ["LODGEMENT_DATE"], return_asdict=True, key_suffix="_STARTING")
-
-        # TODO: DO we want to take the earliest potentials or max potentials?
-        self.difference_record = {
-            "UPRN": self.record1.get("UPRN"),
-            "RDSAP_CHANGE": rdsap_change,
-            "HEAT_DEMAND_CHANGE": heat_demand_change,
-            "CARBON_CHANGE": carbon_change,
-            "SAP_STARTING": self.record1.get(RDSAP_RESPONSE),
-            "SAP_ENDING": self.record2.get(RDSAP_RESPONSE),
-            "HEAT_DEMAND_STARTING": self.record1.get(HEAT_DEMAND_RESPONSE),
-            "HEAT_DEMAND_ENDING": self.record2.get(HEAT_DEMAND_RESPONSE),
-            "CARBON_STARTING": self.record1.get(CARBON_RESPONSE),
-            "CARBON_ENDING": self.record2.get(CARBON_RESPONSE),
-            "POTENTIAL_ENERGY_EFFICIENCY": max(self.record1.get("POTENTIAL_ENERGY_EFFICIENCY"), self.record2.get("POTENTIAL_ENERGY_EFFICIENCY")),
-            "ENVIRONMENT_IMPACT_POTENTIAL": max(self.record1.get("ENVIRONMENT_IMPACT_POTENTIAL"), self.record2.get("ENVIRONMENT_IMPACT_POTENTIAL")),
-            "ENERGY_CONSUMPTION_POTENTIAL": max(self.record1.get("ENERGY_CONSUMPTION_POTENTIAL"), self.record2.get("ENERGY_CONSUMPTION_POTENTIAL")),
-            "CO2_EMISSIONS_POTENTIAL": max(self.record1.get("CO2_EMISSIONS_POTENTIAL"), self.record2.get("CO2_EMISSIONS_POTENTIAL")),
-            **ending_record,
-            **starting_record
-        }
-
-    def _validate_difference_record(self):
-        """
-        This method will validate the difference record
-        """
-        # for key, value in self.difference_record.items():
-        #     if key == "LODGEMENT_DATE":
-        #         continue
-        #     if isinstance(value, str):
-        #         continue
-        #     if value < 0:
-        #         raise ValueError(f"Difference record has negative value for {key}")
-        pass
-            
-    def compare_fields_in_records(self, fields: List[str]):
-        """
-        This method will compare the records, for specific fields
-        """
-        
-        all_equal = True
-        for field in fields:
-            if self.record1.get(field) != self.record2.get(field):
-                return False
-    
-        if all_equal:
-            return True
-            
-    def get(self, key: str):
-        """
-        This method will return the value of the key
-        """
-        return self.difference_record[key] if key in self.difference_record.keys() else None    
-
-    def append_fixed_data(self, fixed_data: dict):
-        """
-        This method will append fixed data to the difference record
-        """
-        self.difference_record.update(fixed_data)
+#     all_equal = True
+#     for col in columns:
+#         if earliest_record[col] != latest_record[col]:
+#             return False
+#     if all_equal:
+#         return True

 def app():
    # Get all the files in the directory
@ -686,9 +426,9 @@ def app():

        cleaning_dataset.append(data_processor.cleaning_averages)

-        data_by_urpn = []
+        data_by_uprn = []
        for uprn, property_data in df.groupby("UPRN", observed=True):
-           asdasd 
+            
            # Fixed features - these are property attributes that shouldn't change over time
            fixed_data = {}

@ -748,102 +488,109 @@ def app():

                difference_record.append_fixed_data(fixed_data)

-                property_model_data.append(difference_record.difference_record)
+                property_model_data.append(difference_record)

-            for idx in range(0, property_data.shape[0] - 1):
+                # property_model_data.append(difference_record.difference_record)
+
+            # for idx in range(0, property_data.shape[0] - 1):
                
-                if idx >= property_data.shape[0] - 1:
-                    break
+            #     if idx >= property_data.shape[0] - 1:
+            #         break

-                earliest_record = variable_data.iloc[idx]
-                latest_record = variable_data.iloc[idx + 1]
+            #     earliest_record = variable_data.iloc[idx]
+            #     latest_record = variable_data.iloc[idx + 1]

-                # Check if the sap gets better or worse
-                gets_better = earliest_record[RDSAP_RESPONSE] <= latest_record[RDSAP_RESPONSE]
+            #     # Check if the sap gets better or worse
+            #     gets_better = earliest_record[RDSAP_RESPONSE] <= latest_record[RDSAP_RESPONSE]

-                component_variables = COMPONENT_FEATURES + EFFICIENCY_FEATURES
+            #     component_variables = COMPONENT_FEATURES + EFFICIENCY_FEATURES

-                if gets_better:
-                    starting_sap = earliest_record[RDSAP_RESPONSE]
-                    starting_heat_demand = earliest_record[HEAT_DEMAND_RESPONSE]
-                    starting_carbon = earliest_record[CARBON_RESPONSE]
+            #     if gets_better:
+            #         starting_sap = earliest_record[RDSAP_RESPONSE]
+            #         starting_heat_demand = earliest_record[HEAT_DEMAND_RESPONSE]
+            #         starting_carbon = earliest_record[CARBON_RESPONSE]

-                    ending_sap = latest_record[RDSAP_RESPONSE]
-                    ending_heat_demand = latest_record[HEAT_DEMAND_RESPONSE]
-                    ending_carbon = latest_record[CARBON_RESPONSE]
+            #         ending_sap = latest_record[RDSAP_RESPONSE]
+            #         ending_heat_demand = latest_record[HEAT_DEMAND_RESPONSE]
+            #         ending_carbon = latest_record[CARBON_RESPONSE]

-                    rdsap_change = latest_record[RDSAP_RESPONSE] - starting_sap
-                    heat_demand_change = latest_record[HEAT_DEMAND_RESPONSE] - starting_heat_demand
-                    carbon_change = latest_record[CARBON_RESPONSE] - starting_carbon
+            #         rdsap_change = latest_record[RDSAP_RESPONSE] - starting_sap
+            #         heat_demand_change = latest_record[HEAT_DEMAND_RESPONSE] - starting_heat_demand
+            #         carbon_change = latest_record[CARBON_RESPONSE] - starting_carbon

-                    starting_record = earliest_record[component_variables + ["LODGEMENT_DATE"]].add_suffix("_STARTING")
-                    ending_record = latest_record[component_variables + ["LODGEMENT_DATE"]].add_suffix("_ENDING")
-                else:
-                    starting_sap = latest_record[RDSAP_RESPONSE]
-                    starting_heat_demand = latest_record[HEAT_DEMAND_RESPONSE]
-                    starting_carbon = latest_record[CARBON_RESPONSE]
+            #         starting_record = earliest_record[component_variables + ["LODGEMENT_DATE"]].add_suffix("_STARTING")
+            #         ending_record = latest_record[component_variables + ["LODGEMENT_DATE"]].add_suffix("_ENDING")
+            #     else:
+            #         starting_sap = latest_record[RDSAP_RESPONSE]
+            #         starting_heat_demand = latest_record[HEAT_DEMAND_RESPONSE]
+            #         starting_carbon = latest_record[CARBON_RESPONSE]

-                    ending_sap = earliest_record[RDSAP_RESPONSE]
-                    ending_heat_demand = earliest_record[HEAT_DEMAND_RESPONSE]
-                    ending_carbon = earliest_record[CARBON_RESPONSE]
+            #         ending_sap = earliest_record[RDSAP_RESPONSE]
+            #         ending_heat_demand = earliest_record[HEAT_DEMAND_RESPONSE]
+            #         ending_carbon = earliest_record[CARBON_RESPONSE]

-                    rdsap_change = earliest_record[RDSAP_RESPONSE] - starting_sap
-                    heat_demand_change = earliest_record[HEAT_DEMAND_RESPONSE] - starting_heat_demand
-                    carbon_change = earliest_record[CARBON_RESPONSE] - starting_carbon
+            #         rdsap_change = earliest_record[RDSAP_RESPONSE] - starting_sap
+            #         heat_demand_change = earliest_record[HEAT_DEMAND_RESPONSE] - starting_heat_demand
+            #         carbon_change = earliest_record[CARBON_RESPONSE] - starting_carbon

-                    starting_record = latest_record[component_variables + ["LODGEMENT_DATE"]].add_suffix("_STARTING")
-                    ending_record = earliest_record[component_variables + ["LODGEMENT_DATE"]].add_suffix("_ENDING")
+            #         starting_record = latest_record[component_variables + ["LODGEMENT_DATE"]].add_suffix("_STARTING")
+            #         ending_record = earliest_record[component_variables + ["LODGEMENT_DATE"]].add_suffix("_ENDING")

-                if rdsap_change == 0:
-                    continue
+            #     if rdsap_change == 0:
+            #         continue

-                all_equal = compare_records(
-                    earliest_record=earliest_record,
-                    latest_record=latest_record,
-                    columns=CORE_COMPONENT_FEATURES
-                )
+            #     all_equal = compare_records(
+            #         earliest_record=earliest_record,
+            #         latest_record=latest_record,
+            #         columns=CORE_COMPONENT_FEATURES
+            #     )

-                if all_equal:
-                    # Keep track of this for the moment so we can analyse
-                    all_equal_rows.append({"uprn": uprn, "directory_name": directory.name})
-                    continue
-                asdasd
-                features = pd.concat([starting_record, ending_record])
+            #     if all_equal:
+            #         # Keep track of this for the moment so we can analyse
+            #         all_equal_rows.append({"uprn": uprn, "directory_name": directory.name})
+            #         continue
+            #     asdasd
+            #     features = pd.concat([starting_record, ending_record])

-                property_model_data.append(
-                    {
-                        "UPRN": uprn,
-                        "RDSAP_CHANGE": rdsap_change,
-                        "HEAT_DEMAND_CHANGE": heat_demand_change,
-                        "CARBON_CHANGE": carbon_change,
-                        "SAP_STARTING": starting_sap,
-                        "SAP_ENDING": ending_sap,
-                        "HEAT_DEMAND_STARTING": starting_heat_demand,
-                        "HEAT_DEMAND_ENDING": ending_heat_demand,
-                        "CARBON_STARTING": starting_carbon,
-                        "CARBON_ENDING": ending_carbon,
-                        "POTENTIAL_ENERGY_EFFICIENCY": earliest_record["POTENTIAL_ENERGY_EFFICIENCY"],
-                        "ENVIRONMENT_IMPACT_POTENTIAL": earliest_record["ENVIRONMENT_IMPACT_POTENTIAL"],
-                        "ENERGY_CONSUMPTION_POTENTIAL": earliest_record["ENERGY_CONSUMPTION_POTENTIAL"],
-                        "CO2_EMISSIONS_POTENTIAL": earliest_record["CO2_EMISSIONS_POTENTIAL"],
-                        **fixed_data,
-                        **features.to_dict(),
-                    }
-                )
+            #     property_model_data.append(
+            #         {
+            #             "UPRN": uprn,
+            #             "RDSAP_CHANGE": rdsap_change,
+            #             "HEAT_DEMAND_CHANGE": heat_demand_change,
+            #             "CARBON_CHANGE": carbon_change,
+            #             "SAP_STARTING": starting_sap,
+            #             "SAP_ENDING": ending_sap,
+            #             "HEAT_DEMAND_STARTING": starting_heat_demand,
+            #             "HEAT_DEMAND_ENDING": ending_heat_demand,
+            #             "CARBON_STARTING": starting_carbon,
+            #             "CARBON_ENDING": ending_carbon,
+            #             "POTENTIAL_ENERGY_EFFICIENCY": earliest_record["POTENTIAL_ENERGY_EFFICIENCY"],
+            #             "ENVIRONMENT_IMPACT_POTENTIAL": earliest_record["ENVIRONMENT_IMPACT_POTENTIAL"],
+            #             "ENERGY_CONSUMPTION_POTENTIAL": earliest_record["ENERGY_CONSUMPTION_POTENTIAL"],
+            #             "CO2_EMISSIONS_POTENTIAL": earliest_record["CO2_EMISSIONS_POTENTIAL"],
+            #             **fixed_data,
+            #             **features.to_dict(),
+            #         }
+            #     )

-            data_by_urpn.extend(property_model_data)
+            # data_by_urpn.extend(property_model_data)
+            data_by_uprn.extend(property_model_data)
+
+        from etl.epc.Dataset import TrainingDataset
+        constituency_data = TrainingDataset(datasets=data_by_uprn)

        data_by_urpn_df = pd.DataFrame(data_by_urpn)

-        data_by_urpn_df["DAYS_TO_STARTING"] = DataProcessor.calculate_days_to(
-            data_by_urpn_df["LODGEMENT_DATE_STARTING"]
-        )
+        # TODO: can we move this into the EPC record?
+        # data_by_urpn_df["DAYS_TO_STARTING"] = DataProcessor.calculate_days_to(
+        #     data_by_urpn_df["LODGEMENT_DATE_STARTING"]
+        # )

-        data_by_urpn_df["DAYS_TO_ENDING"] = DataProcessor.calculate_days_to(
-            data_by_urpn_df["LODGEMENT_DATE_ENDING"]
-        )
+        # data_by_urpn_df["DAYS_TO_ENDING"] = DataProcessor.calculate_days_to(
+        #     data_by_urpn_df["LODGEMENT_DATE_ENDING"]
+        # )

-        data_by_urpn_df = data_by_urpn_df.drop(columns=["LODGEMENT_DATE_STARTING", "LODGEMENT_DATE_ENDING"])
+        # data_by_urpn_df = data_by_urpn_df.drop(columns=["LODGEMENT_DATE_STARTING", "LODGEMENT_DATE_ENDING"])

        data_by_urpn_df = DataProcessor.clean_efficiency_variables(data_by_urpn_df)

@ -889,6 +636,7 @@ def app():

    output = pd.concat(dataset)

+    # TODO: move this floor-area check into the difference record
    # Remove any records that have huge swings in their floor area
    output["tfa_diff_abs"] = abs(output["TOTAL_FLOOR_AREA_ENDING"] - output["TOTAL_FLOOR_AREA_STARTING"])
    output["tfa_diff_prop"] = output["tfa_diff_abs"] / output["TOTAL_FLOOR_AREA_STARTING"]