diff --git a/etl/epc/Dataset.py b/etl/epc/Dataset.py new file mode 100644 index 00000000..8379546f --- /dev/null +++ b/etl/epc/Dataset.py @@ -0,0 +1,107 @@ +import pandas as pd +from typing import List +from etl.epc.EPCRecord import EPCDifferenceRecord + +class TrainingDataset: + """ + A collection of EPCDifferenceRecords can be combined into a TrainingDataset. + """ + + def __init__(self, datasets: List[EPCDifferenceRecord]) -> None: + self.datasets = datasets + self.df = pd.DataFrame([dataset.difference_record for dataset in datasets]) + + self._feature_generation() + self._drop_features() + self._clean_dataframe() + self._clean_efficiency_variables(self.df) + + def _drop_features(self): + """ + Drop features that are not needed for modelling + """ + self.df = self.df.drop(columns=["LODGEMENT_DATE_STARTING", "LODGEMENT_DATE_ENDING"]) + + + def _feature_generation(self): + """ + Generate features for modelling + """ + self.df["DAYS_TO_STARTING"] = self._calculate_days_to(self.df["LODGEMENT_DATE_STARTING"]) + self.df["DAYS_TO_ENDING"] = self._calculate_days_to(self.df["LODGEMENT_DATE_ENDING"]) + + @staticmethod + def _clean_efficiency_variables(df): + + """ + These is scope to clean this by the model per corresponding description. + E.g. for WALLS_ENG_EFF we could look at the mode efficiency rating by description and + fill in the missing values with this. 
+ When looking at this initially, there are a large volume of records with missing energy efficiency + values and therefore a simpler approach was taken just to test including these variables + :param df: + :return: + """ + + missings = pd.isnull(df).sum() + missings = missings[missings >= 1] + + if len(missings) == 0: + return df + + # Make sure they are all efficiency columns + if any(~missings.index.str.contains("ENERGY_EFF")): + raise ValueError("Non efficiency columns are missing") + + for m in missings.index: + df[m] = df[m].fillna("NO_RATING") + + return df + + @staticmethod + def _calculate_days_to(lodgement_date): + # A str input gives a scalar Timedelta (.days); anything else is treated as a Series and uses the .dt accessor + if isinstance(lodgement_date, str): + return ( + pd.to_datetime(lodgement_date) - pd.to_datetime(EARLIEST_EPC_DATE) + ).days + + return ( + pd.to_datetime(lodgement_date) - pd.to_datetime(EARLIEST_EPC_DATE) + ).dt.days + + def __add__(self, other) -> "TrainingDataset": + if not isinstance(other, TrainingDataset): + raise TypeError("Addition can only be performed with another instance of TrainingDataset") + return TrainingDataset(self.datasets + other.datasets) + + def __radd__(self, other): + """ + Required for sum() to work + """ + if isinstance(other, int): + return self + else: + return self.__add__(other) + +class ScoringDataset: + """ + A collection of EPCDifferenceRecords can be combined into a ScoringDataset. 
+ """ + + def __init__(self, datasets: List[EPCDifferenceRecord]) -> None: + self.datasets = datasets + + def __add__(self, other) -> "ScoringDataset": + if not isinstance(other, ScoringDataset): + raise TypeError("Addition can only be performed with another instance of ScoringDataset") + return ScoringDataset(self.datasets + other.datasets) + + def __radd__(self, other): + """ + Required for sum() to work + """ + if isinstance(other, int): + return self + else: + return self.__add__(other) \ No newline at end of file diff --git a/etl/epc/EPCRecord.py b/etl/epc/EPCRecord.py new file mode 100644 index 00000000..73e60483 --- /dev/null +++ b/etl/epc/EPCRecord.py @@ -0,0 +1,290 @@ + +from dataclasses import dataclass +from etl.epc.ValidationConfiguration import ( + EPCRecordValidationConfiguration, + EPCDifferenceRecordValidationConfiguration, + EPCDifferenceRecordFixedDataValidationConfiguration +) +from typing import Union, List +from etl.epc.settings import ( + RDSAP_RESPONSE, + HEAT_DEMAND_RESPONSE, + CARBON_RESPONSE, + COMPONENT_FEATURES, + EFFICIENCY_FEATURES +) + +@dataclass +class EPCRecord: + """ + Base class for a EPC record + """ + UPRN: str + WALLS_DESCRIPTION: str + FLOOR_DESCRIPTION: str + LIGHTING_DESCRIPTION: str + ROOF_DESCRIPTION: str + MAINHEAT_DESCRIPTION: str + HOTWATER_DESCRIPTION: str + MAIN_FUEL: str + MECHANICAL_VENTILATION: str + SECONDHEAT_DESCRIPTION: str + WINDOWS_DESCRIPTION: str + GLAZED_TYPE: str + MULTI_GLAZE_PROPORTION: float + LOW_ENERGY_LIGHTING: float + NUMBER_OPEN_FIREPLACES: float + MAINHEATCONT_DESCRIPTION: str + SOLAR_WATER_HEATING_FLAG: str + PHOTO_SUPPLY: float + TRANSACTION_TYPE: str + ENERGY_TARIFF: str + EXTENSION_COUNT: float + TOTAL_FLOOR_AREA: float + FLOOR_HEIGHT: float + HOT_WATER_ENERGY_EFF: str + FLOOR_ENERGY_EFF: str + WINDOWS_ENERGY_EFF: str + WALLS_ENERGY_EFF: str + SHEATING_ENERGY_EFF: str + ROOF_ENERGY_EFF: str + MAINHEAT_ENERGY_EFF: str + MAINHEATC_ENERGY_EFF: str + LIGHTING_ENERGY_EFF: str + 
+ POTENTIAL_ENERGY_EFFICIENCY: float + ENVIRONMENT_IMPACT_POTENTIAL: float + ENERGY_CONSUMPTION_POTENTIAL: float + CO2_EMISSIONS_POTENTIAL: float + LODGEMENT_DATE: str + CURRENT_ENERGY_EFFICIENCY: int + ENERGY_CONSUMPTION_CURRENT: int + CO2_EMISSIONS_CURRENT: float + + def __post_init__(self): + # We can have validation and cleaning steps for each of the fields + # self.WALLS_DESCRIPTION = 'check' + # Could also have cleaning of records if needed + self.validation_configuration = EPCRecordValidationConfiguration + + # self._field_validation() + pass + + def _field_validation(self): + """ + This method will validate each of the fields in the EPC record + """ + + for record_key, validation_config in self.validation_configuration.items(): + # Get the variable named record key from self + field_value = self.__dict__[record_key] + + if validation_config['type'] == "string": + self._validate_string(record_key, field_value, validation_config) + elif validation_config['type'] == "float": + self._validate_float(record_key, field_value, validation_config) + else: + raise ValueError(f"Validation type {validation_config['type']} not supported") + + def _validate_string(self, record_key: str, field_value: Union[str, float], validation_config: dict): + """ + Validate a string field + """ + if not isinstance(field_value, str): + raise ValueError(f"Field {record_key} has value {field_value} which is not a string") + + if 'function' in validation_config: + try: + validation_config['function'](field_value) + except Exception as err: + raise ValueError(f"Field {record_key} has value {field_value} which does not pass the validation function {validation_config['function']}") from err + + if validation_config['acceptable_values'] is not None: + if field_value not in validation_config['acceptable_values']: + raise ValueError(f"Field {record_key} has value {field_value} which is not in the acceptable values of {validation_config['acceptable_values']}") + + def _validate_float(self, record_key: str, field_value: Union[str, 
float], validation_config: dict): + """ + Validate a float field + """ + if not isinstance(field_value, float): + raise ValueError(f"Field {record_key} has value {field_value} which is not a float") + + if 'function' in validation_config: + try: + validation_config['function'](field_value) + except Exception as err: + raise ValueError(f"Field {record_key} has value {field_value} which does not pass the validation function {validation_config['function']}") from err + + if validation_config['range'] is not None: + if field_value < validation_config['range'][0] or field_value > validation_config['range'][1]: + raise ValueError(f"Field {record_key} has value {field_value} which is not in the acceptable range of {validation_config['range']}") + + def __sub__(self, other): + """ + This method will return the difference between two EPC records + """ + if not isinstance(other, EPCRecord): + raise ValueError("Can only subtract EPCRecord from EPCRecord") + + difference_record = EPCDifferenceRecord(record1=self, record2=other, auto_sort=True) + + return difference_record + + def __gt__(self, other): + """ + This method will return True if the EPC record is strictly greater than the other + """ + if not isinstance(other, EPCRecord): + raise ValueError("Can only compare EPCRecord to EPCRecord") + + return self.__dict__[RDSAP_RESPONSE] > other.__dict__[RDSAP_RESPONSE] + + def __ge__(self, other): + """ + This method will return True if the EPC record is greater than or equal to the other + """ + if not isinstance(other, EPCRecord): + raise ValueError("Can only compare EPCRecord to EPCRecord") + + return self.__dict__[RDSAP_RESPONSE] >= other.__dict__[RDSAP_RESPONSE] + + def __lt__(self, other): + """ + This method will return True if the EPC record is strictly less than the other + """ + if not isinstance(other, EPCRecord): + raise ValueError("Can only compare EPCRecord to EPCRecord") + + return self.__dict__[RDSAP_RESPONSE] < other.__dict__[RDSAP_RESPONSE] + + def __le__(self, other): + """ + 
This method will return True if the EPC record is less than or equal to the other + """ + if not isinstance(other, EPCRecord): + raise ValueError("Can only compare EPCRecord to EPCRecord") + + return self.__dict__[RDSAP_RESPONSE] <= other.__dict__[RDSAP_RESPONSE] + + def get(self, key: Union[str, List[str]], return_asdict: bool = False, key_suffix: str = None): + """ + Return the value for a str key, a list of values for a list of keys, or (return_asdict) a dict keyed by each key plus key_suffix; missing keys give None + """ + if return_asdict: + output_dict = {x: self.__dict__[x] if x in self.__dict__.keys() else None for x in key} + if key_suffix is not None: + output_dict = {f"{x}{key_suffix}": y for x, y in output_dict.items()} + return output_dict + + if isinstance(key, list): + return [self.__dict__[x] if x in self.__dict__.keys() else None for x in key] + elif isinstance(key, str): + return self.__dict__[key] if key in self.__dict__.keys() else None + + +class EPCDifferenceRecord: + """ + Base class for the difference between two EPC records + """ + + def __init__(self, record1: EPCRecord, record2: EPCRecord, auto_sort: bool = False): + """ + This method will initialise the EPCDifferenceRecord + Default usage expects record2 to have the higher RDSAP score + """ + self.record1 = record1 + self.record2 = record2 + self.difference_record = {} + + self.difference_validation_configuration = EPCDifferenceRecordValidationConfiguration + self.fixed_data_validation_configuration = EPCDifferenceRecordFixedDataValidationConfiguration + # With auto_sort, swap the pair whenever record2 does not score above record1 + if auto_sort and (self.record2 <= self.record1): + self.record1, self.record2 = self.record2, self.record1 + + self._construct_difference_record() + self._validate_difference_record() + + + + def _construct_difference_record(self): + """ + This method will construct the difference record between the two records + """ + + rdsap_change = self.record2.get(RDSAP_RESPONSE) - self.record1.get(RDSAP_RESPONSE) + heat_demand_change = self.record2.get(HEAT_DEMAND_RESPONSE) - self.record1.get(HEAT_DEMAND_RESPONSE) + carbon_change = 
self.record2.get(CARBON_RESPONSE) - self.record1.get(CARBON_RESPONSE) + + component_variables = COMPONENT_FEATURES + EFFICIENCY_FEATURES + ending_record = self.record2.get(component_variables + ["LODGEMENT_DATE"], return_asdict=True, key_suffix="_ENDING") + starting_record = self.record1.get(component_variables + ["LODGEMENT_DATE"], return_asdict=True, key_suffix="_STARTING") + + # TODO: DO we want to take the earliest potentials or max potentials? + self.difference_record = { + "UPRN": self.record1.get("UPRN"), + "RDSAP_CHANGE": rdsap_change, + "HEAT_DEMAND_CHANGE": heat_demand_change, + "CARBON_CHANGE": carbon_change, + "SAP_STARTING": self.record1.get(RDSAP_RESPONSE), + "SAP_ENDING": self.record2.get(RDSAP_RESPONSE), + "HEAT_DEMAND_STARTING": self.record1.get(HEAT_DEMAND_RESPONSE), + "HEAT_DEMAND_ENDING": self.record2.get(HEAT_DEMAND_RESPONSE), + "CARBON_STARTING": self.record1.get(CARBON_RESPONSE), + "CARBON_ENDING": self.record2.get(CARBON_RESPONSE), + "POTENTIAL_ENERGY_EFFICIENCY": max(self.record1.get("POTENTIAL_ENERGY_EFFICIENCY"), self.record2.get("POTENTIAL_ENERGY_EFFICIENCY")), + "ENVIRONMENT_IMPACT_POTENTIAL": max(self.record1.get("ENVIRONMENT_IMPACT_POTENTIAL"), self.record2.get("ENVIRONMENT_IMPACT_POTENTIAL")), + "ENERGY_CONSUMPTION_POTENTIAL": max(self.record1.get("ENERGY_CONSUMPTION_POTENTIAL"), self.record2.get("ENERGY_CONSUMPTION_POTENTIAL")), + "CO2_EMISSIONS_POTENTIAL": max(self.record1.get("CO2_EMISSIONS_POTENTIAL"), self.record2.get("CO2_EMISSIONS_POTENTIAL")), + **ending_record, + **starting_record + } + + def _validate_difference_record(self): + """ + This method will validate the difference record + """ + # for key, value in self.difference_record.items(): + # if key == "LODGEMENT_DATE": + # continue + # if isinstance(value, str): + # continue + # if value < 0: + # raise ValueError(f"Difference record has negative value for {key}") + pass + + def compare_fields_in_records(self, fields: List[str]): + """ + This method will compare the 
records, for specific fields + """ + + all_equal = True + for field in fields: + if self.record1.get(field) != self.record2.get(field): + return False + + if all_equal: + return True + + def get(self, key: str): + """ + This method will return the value of the key + """ + return self.difference_record[key] if key in self.difference_record.keys() else None + + def append_fixed_data(self, fixed_data: dict): + """ + This method will append fixed data to the difference record + """ + self._validate_fixed_data(fixed_data) + self.difference_record.update(fixed_data) + + def _validate_fixed_data(self, fixed_data: dict): + """ + This method will validate the fixed data + """ + + # Can have more sophisticated checks here + # self.fixed_data_validataion_configuration + + pass diff --git a/etl/epc/ValidationConfiguration.py b/etl/epc/ValidationConfiguration.py index 5d9c910d..fdca024a 100644 --- a/etl/epc/ValidationConfiguration.py +++ b/etl/epc/ValidationConfiguration.py @@ -21,3 +21,37 @@ EPCRecordValidationConfiguration = { "range": [0, 100] } } + +EPCDifferenceRecordValidationConfiguration = { +} + +EPCDifferenceRecordFixedDataValidationConfiguration = { + "PROPERTY_TYPE": { + "type": "string", + "acceptable_values": ["House", "Flat", "Bungalow", "Maisonette", "Park home", "Other"] + }, + "BUILT_FORM": { + "type": "string", + "acceptable_values": ["Detached", "Semi-Detached", "End-Terrace", "Mid-Terrace", "Enclosed Mid-Terrace", "Enclosed End-Terrace", "Enclosed Detached", "Not applicable"] + }, + "CONSITUENCY": { + "type": "string", + "acceptable_values": ["England", "Wales", "Scotland", "Northern Ireland"] + }, + "NUMBER_HABITABLE_ROOMS": { + "type": "integer", + "range": [0, 100] + }, + "NUMBER_HEATED_ROOMS": { + "type": "integer", + "range": [0, 100] + }, + "FIXED_LIGHTING_OUTLETS_COUNT": { + "type": "integer", + "range": [0, 100] + }, + "CONSTRUCTION_AGE_BAND": { + "type": "string", + "acceptable_values": [] + } +} \ No newline at end of file diff --git 
a/etl/epc/property_change_app.py b/etl/epc/property_change_app.py index ecc79ba3..da17fe05 100644 --- a/etl/epc/property_change_app.py +++ b/etl/epc/property_change_app.py @@ -18,6 +18,7 @@ from etl.epc.settings import ( MINIMUM_FLOOR_HEIGHT ) from etl.epc.DataProcessor import DataProcessor +from etl.epc.EPCRecord import EPCRecord, EPCDifferenceRecord from utils.s3 import save_dataframe_to_s3_parquet, read_from_s3 from recommendations.rdsap_tables import england_wales_age_band_lookup from recommendations.recommendation_utils import ( @@ -223,6 +224,7 @@ def make_uvalues(df): df["row_index"] = df.index uvalues = [] + # TODO: iterrows is the slowest way to do this, we should use a vectorised approach or itertuples for _, x in df.iterrows(): uprn = x["UPRN"] @@ -379,285 +381,23 @@ def make_uvalues(df): return df -def compare_records(earliest_record: pd.Series, latest_record: pd.Series, columns: list): - """ - For a list of columns, check if the earliest and latest record are the same - If they are the same, we indicate this, because we have example of SAP scores changing - without any feature changes - :param earliest_record: pd.Series - :param latest_record: pd.Series - :param columns: list of columns to compare - :return: boolean indicating whether or not all features are the same - """ +# def compare_records(earliest_record: pd.Series, latest_record: pd.Series, columns: list): +# """ +# For a list of columns, check if the earliest and latest record are the same +# If they are the same, we indicate this, because we have example of SAP scores changing +# without any feature changes +# :param earliest_record: pd.Series +# :param latest_record: pd.Series +# :param columns: list of columns to compare +# :return: boolean indicating whether or not all features are the same +# """ - all_equal = True - for col in columns: - if earliest_record[col] != latest_record[col]: - return False - if all_equal: - return True - -from dataclasses import dataclass -from 
etl.epc.ValidationConfiguration import EPCRecordValidationConfiguration -from typing import Union, List - -@dataclass -class EPCRecord: - """ - Base class for a EPC record - """ - UPRN: str - WALLS_DESCRIPTION: str - FLOOR_DESCRIPTION: str - LIGHTING_DESCRIPTION: str - ROOF_DESCRIPTION: str - MAINHEAT_DESCRIPTION: str - HOTWATER_DESCRIPTION: str - MAIN_FUEL: str - MECHANICAL_VENTILATION: str - SECONDHEAT_DESCRIPTION: str - WINDOWS_DESCRIPTION: str - GLAZED_TYPE: str - MULTI_GLAZE_PROPORTION: float - LOW_ENERGY_LIGHTING: float - NUMBER_OPEN_FIREPLACES: float - MAINHEATCONT_DESCRIPTION: str - SOLAR_WATER_HEATING_FLAG: str - PHOTO_SUPPLY: float - TRANSACTION_TYPE: str - ENERGY_TARIFF: str - EXTENSION_COUNT: float - TOTAL_FLOOR_AREA: float - FLOOR_HEIGHT: float - HOT_WATER_ENERGY_EFF: str - FLOOR_ENERGY_EFF: str - WINDOWS_ENERGY_EFF: str - WALLS_ENERGY_EFF: str - SHEATING_ENERGY_EFF: str - ROOF_ENERGY_EFF: str - MAINHEAT_ENERGY_EFF: str - MAINHEATC_ENERGY_EFF: str - LIGHTING_ENERGY_EFF: str - POTENTIAL_ENERGY_EFFICIENCY: float - ENVIRONMENT_IMPACT_POTENTIAL: float - ENERGY_CONSUMPTION_POTENTIAL: float - CO2_EMISSIONS_POTENTIAL: float - LODGEMENT_DATE: str - CURRENT_ENERGY_EFFICIENCY: int - ENERGY_CONSUMPTION_CURRENT: int - CO2_EMISSIONS_CURRENT: float - - def __post_init__(self): - # We can have validation and cleaning steps for each of the fields - # self.WALLS_DESCRIPTION = 'check' - # Could also have cleaning of records if needed - # self._field_validation() - pass - - def _field_validation(self): - """ - This method will validate each of the fields in the EPC record - """ - self.validation_configuration = EPCRecordValidationConfiguration - - for record_key, validation_config in self.validation_configuration.items(): - # Get the variable named record key from self - field_value = self.__dict__[record_key] - - if validation_config['type'] == "string": - self._validate_string(record_key, field_value, validation_config) - elif validation_config['type'] == "float": - 
self._validate_float(field_value, validation_config) - else: - raise ValueError(f"Validation type {validation_config['type']} not supported") - - def _validate_string(self, record_key: str, field_value: Union[str, float], validation_config: dict): - """ - Validate a string field - """ - if not isinstance(field_value, str): - raise ValueError(f"Field {record_key} has value {field_value} which is not a string") - - if 'function' in validation_config: - try: - validation_config['function'](field_value) - except: - raise ValueError(f"Field {record_key} has value {field_value} which does not pass the validation function {validation_config['function']}") - - if validation_config['acceptable_values'] is not None: - if field_value not in validation_config['acceptable_values']: - raise ValueError(f"Field {record_key} has value {field_value} which is not in the acceptable values of {validation_config['acceptable_values']}") - - def _validate_float(self, record_key: str, field_value: Union[str, float], validation_config: dict): - """ - Validate a float field - """ - if not isinstance(field_value, float): - raise ValueError(f"Field {record_key} has value {field_value} which is not a float") - - if 'function' in validation_config: - try: - validation_config['function'](field_value) - except: - raise ValueError(f"Field {record_key} has value {field_value} which does not pass the validation function {validation_config['function']}") - - if validation_config['range'] is not None: - if field_value < validation_config['range'][0] or field_value > validation_config['range'][1]: - raise ValueError(f"Field {record_key} has value {field_value} which is not in the acceptable range of {validation_config['range']}") - - def __sub__(self, other): - """ - This method will return the difference between two EPC records - """ - if not isinstance(other, EPCRecord): - raise ValueError("Can only subtract EPCRecord from EPCRecord") - - difference_record = EPCDifferenceRecord(record1=self, 
record2=other, auto_sort=True) - - return difference_record - - def __gt__(self, other): - """ - This method will return True if the EPC record is greater than or equal to the other - """ - if not isinstance(other, EPCRecord): - raise ValueError("Can only compare EPCRecord to EPCRecord") - - return self.__dict__[RDSAP_RESPONSE] > other.__dict__[RDSAP_RESPONSE] - - def __ge__(self, other): - """ - This method will return True if the EPC record is greater than or equal to the other - """ - if not isinstance(other, EPCRecord): - raise ValueError("Can only compare EPCRecord to EPCRecord") - - return self.__dict__[RDSAP_RESPONSE] >= other.__dict__[RDSAP_RESPONSE] - - def __lt__(self, other): - """ - This method will return True if the EPC record is greater than or equal to the other - """ - if not isinstance(other, EPCRecord): - raise ValueError("Can only compare EPCRecord to EPCRecord") - - return self.__dict__[RDSAP_RESPONSE] < other.__dict__[RDSAP_RESPONSE] - - def __le__(self, other): - """ - This method will return True if the EPC record is greater than or equal to the other - """ - if not isinstance(other, EPCRecord): - raise ValueError("Can only compare EPCRecord to EPCRecord") - - return self.__dict__[RDSAP_RESPONSE] <= other.__dict__[RDSAP_RESPONSE] - - def get(self, key: Union[str, List[str]], return_asdict: bool = False, key_suffix: str = None): - """ - This method will return the value of the key - """ - if return_asdict: - output_dict = {x: self.__dict__[x] if x in self.__dict__.keys() else None for x in key} - if key_suffix is not None: - output_dict = {f"{x}_{key_suffix}": y for x, y in output_dict.items()} - return output_dict - - if isinstance(key, list): - return [self.__dict__[x] if x in self.__dict__.keys() else None for x in key] - elif isinstance(key, str): - return self.__dict__[key] if key in self.__dict__.keys() else None - - -class EPCDifferenceRecord: - """ - Base class for the difference between two EPC records - """ - - def __init__(self, 
record1: EPCRecord, record2: EPCRecord, auto_sort: bool = False): - """ - This method will initialise the EPCDifferenceRecord - Defaults usage is with record2 to have the higher RDSAP score - """ - self.record1 = record1 - self.record2 = record2 - self.difference_record = {} - - if auto_sort and (self.record2 <= self.record1): - self.record1, self.record2 = self.record2, self.record1 - - self._construct_difference_record() - self._validate_difference_record() - - def _construct_difference_record(self): - """ - This method will construct the difference record between the two records - """ - - rdsap_change = self.record2.get(RDSAP_RESPONSE) - self.record1.get(RDSAP_RESPONSE) - heat_demand_change = self.record2.get(HEAT_DEMAND_RESPONSE) - self.record1.get(HEAT_DEMAND_RESPONSE) - carbon_change = self.record2.get(CARBON_RESPONSE) - self.record1.get(CARBON_RESPONSE) - - component_variables = COMPONENT_FEATURES + EFFICIENCY_FEATURES - ending_record = self.record2.get(component_variables + ["LODGEMENT_DATE"], return_asdict=True, key_suffix="_ENDING") - starting_record = self.record1.get(component_variables + ["LODGEMENT_DATE"], return_asdict=True, key_suffix="_STARTING") - - # TODO: DO we want to take the earliest potentials or max potentials? 
- self.difference_record = { - "UPRN": self.record1.get("UPRN"), - "RDSAP_CHANGE": rdsap_change, - "HEAT_DEMAND_CHANGE": heat_demand_change, - "CARBON_CHANGE": carbon_change, - "SAP_STARTING": self.record1.get(RDSAP_RESPONSE), - "SAP_ENDING": self.record2.get(RDSAP_RESPONSE), - "HEAT_DEMAND_STARTING": self.record1.get(HEAT_DEMAND_RESPONSE), - "HEAT_DEMAND_ENDING": self.record2.get(HEAT_DEMAND_RESPONSE), - "CARBON_STARTING": self.record1.get(CARBON_RESPONSE), - "CARBON_ENDING": self.record2.get(CARBON_RESPONSE), - "POTENTIAL_ENERGY_EFFICIENCY": max(self.record1.get("POTENTIAL_ENERGY_EFFICIENCY"), self.record2.get("POTENTIAL_ENERGY_EFFICIENCY")), - "ENVIRONMENT_IMPACT_POTENTIAL": max(self.record1.get("ENVIRONMENT_IMPACT_POTENTIAL"), self.record2.get("ENVIRONMENT_IMPACT_POTENTIAL")), - "ENERGY_CONSUMPTION_POTENTIAL": max(self.record1.get("ENERGY_CONSUMPTION_POTENTIAL"), self.record2.get("ENERGY_CONSUMPTION_POTENTIAL")), - "CO2_EMISSIONS_POTENTIAL": max(self.record1.get("CO2_EMISSIONS_POTENTIAL"), self.record2.get("CO2_EMISSIONS_POTENTIAL")), - **ending_record, - **starting_record - } - - def _validate_difference_record(self): - """ - This method will validate the difference record - """ - # for key, value in self.difference_record.items(): - # if key == "LODGEMENT_DATE": - # continue - # if isinstance(value, str): - # continue - # if value < 0: - # raise ValueError(f"Difference record has negative value for {key}") - pass - - def compare_fields_in_records(self, fields: List[str]): - """ - This method will compare the records, for specific fields - """ - - all_equal = True - for field in fields: - if self.record1.get(field) != self.record2.get(field): - return False - - if all_equal: - return True - - def get(self, key: str): - """ - This method will return the value of the key - """ - return self.difference_record[key] if key in self.difference_record.keys() else None - - def append_fixed_data(self, fixed_data: dict): - """ - This method will append fixed data to the 
difference record - """ - self.difference_record.update(fixed_data) +# all_equal = True +# for col in columns: +# if earliest_record[col] != latest_record[col]: +# return False +# if all_equal: +# return True def app(): # Get all the files in the directory @@ -686,9 +426,9 @@ def app(): cleaning_dataset.append(data_processor.cleaning_averages) - data_by_urpn = [] + data_by_uprn = [] for uprn, property_data in df.groupby("UPRN", observed=True): - asdasd + # Fixed features - these are property attributes that shouldn't change over time fixed_data = {} @@ -748,102 +488,109 @@ def app(): difference_record.append_fixed_data(fixed_data) - property_model_data.append(difference_record.difference_record) + property_model_data.append(difference_record) - for idx in range(0, property_data.shape[0] - 1): + # property_model_data.append(difference_record.difference_record) + + # for idx in range(0, property_data.shape[0] - 1): - if idx >= property_data.shape[0] - 1: - break + # if idx >= property_data.shape[0] - 1: + # break - earliest_record = variable_data.iloc[idx] - latest_record = variable_data.iloc[idx + 1] + # earliest_record = variable_data.iloc[idx] + # latest_record = variable_data.iloc[idx + 1] - # Check if the sap gets better or worse - gets_better = earliest_record[RDSAP_RESPONSE] <= latest_record[RDSAP_RESPONSE] + # # Check if the sap gets better or worse + # gets_better = earliest_record[RDSAP_RESPONSE] <= latest_record[RDSAP_RESPONSE] - component_variables = COMPONENT_FEATURES + EFFICIENCY_FEATURES + # component_variables = COMPONENT_FEATURES + EFFICIENCY_FEATURES - if gets_better: - starting_sap = earliest_record[RDSAP_RESPONSE] - starting_heat_demand = earliest_record[HEAT_DEMAND_RESPONSE] - starting_carbon = earliest_record[CARBON_RESPONSE] + # if gets_better: + # starting_sap = earliest_record[RDSAP_RESPONSE] + # starting_heat_demand = earliest_record[HEAT_DEMAND_RESPONSE] + # starting_carbon = earliest_record[CARBON_RESPONSE] - ending_sap = 
latest_record[RDSAP_RESPONSE] - ending_heat_demand = latest_record[HEAT_DEMAND_RESPONSE] - ending_carbon = latest_record[CARBON_RESPONSE] + # ending_sap = latest_record[RDSAP_RESPONSE] + # ending_heat_demand = latest_record[HEAT_DEMAND_RESPONSE] + # ending_carbon = latest_record[CARBON_RESPONSE] - rdsap_change = latest_record[RDSAP_RESPONSE] - starting_sap - heat_demand_change = latest_record[HEAT_DEMAND_RESPONSE] - starting_heat_demand - carbon_change = latest_record[CARBON_RESPONSE] - starting_carbon + # rdsap_change = latest_record[RDSAP_RESPONSE] - starting_sap + # heat_demand_change = latest_record[HEAT_DEMAND_RESPONSE] - starting_heat_demand + # carbon_change = latest_record[CARBON_RESPONSE] - starting_carbon - starting_record = earliest_record[component_variables + ["LODGEMENT_DATE"]].add_suffix("_STARTING") - ending_record = latest_record[component_variables + ["LODGEMENT_DATE"]].add_suffix("_ENDING") - else: - starting_sap = latest_record[RDSAP_RESPONSE] - starting_heat_demand = latest_record[HEAT_DEMAND_RESPONSE] - starting_carbon = latest_record[CARBON_RESPONSE] + # starting_record = earliest_record[component_variables + ["LODGEMENT_DATE"]].add_suffix("_STARTING") + # ending_record = latest_record[component_variables + ["LODGEMENT_DATE"]].add_suffix("_ENDING") + # else: + # starting_sap = latest_record[RDSAP_RESPONSE] + # starting_heat_demand = latest_record[HEAT_DEMAND_RESPONSE] + # starting_carbon = latest_record[CARBON_RESPONSE] - ending_sap = earliest_record[RDSAP_RESPONSE] - ending_heat_demand = earliest_record[HEAT_DEMAND_RESPONSE] - ending_carbon = earliest_record[CARBON_RESPONSE] + # ending_sap = earliest_record[RDSAP_RESPONSE] + # ending_heat_demand = earliest_record[HEAT_DEMAND_RESPONSE] + # ending_carbon = earliest_record[CARBON_RESPONSE] - rdsap_change = earliest_record[RDSAP_RESPONSE] - starting_sap - heat_demand_change = earliest_record[HEAT_DEMAND_RESPONSE] - starting_heat_demand - carbon_change = earliest_record[CARBON_RESPONSE] - 
starting_carbon + # rdsap_change = earliest_record[RDSAP_RESPONSE] - starting_sap + # heat_demand_change = earliest_record[HEAT_DEMAND_RESPONSE] - starting_heat_demand + # carbon_change = earliest_record[CARBON_RESPONSE] - starting_carbon - starting_record = latest_record[component_variables + ["LODGEMENT_DATE"]].add_suffix("_STARTING") - ending_record = earliest_record[component_variables + ["LODGEMENT_DATE"]].add_suffix("_ENDING") + # starting_record = latest_record[component_variables + ["LODGEMENT_DATE"]].add_suffix("_STARTING") + # ending_record = earliest_record[component_variables + ["LODGEMENT_DATE"]].add_suffix("_ENDING") - if rdsap_change == 0: - continue + # if rdsap_change == 0: + # continue - all_equal = compare_records( - earliest_record=earliest_record, - latest_record=latest_record, - columns=CORE_COMPONENT_FEATURES - ) + # all_equal = compare_records( + # earliest_record=earliest_record, + # latest_record=latest_record, + # columns=CORE_COMPONENT_FEATURES + # ) - if all_equal: - # Keep track of this for the moment so we can analyse - all_equal_rows.append({"uprn": uprn, "directory_name": directory.name}) - continue - asdasd - features = pd.concat([starting_record, ending_record]) + # if all_equal: + # # Keep track of this for the moment so we can analyse + # all_equal_rows.append({"uprn": uprn, "directory_name": directory.name}) + # continue + # asdasd + # features = pd.concat([starting_record, ending_record]) - property_model_data.append( - { - "UPRN": uprn, - "RDSAP_CHANGE": rdsap_change, - "HEAT_DEMAND_CHANGE": heat_demand_change, - "CARBON_CHANGE": carbon_change, - "SAP_STARTING": starting_sap, - "SAP_ENDING": ending_sap, - "HEAT_DEMAND_STARTING": starting_heat_demand, - "HEAT_DEMAND_ENDING": ending_heat_demand, - "CARBON_STARTING": starting_carbon, - "CARBON_ENDING": ending_carbon, - "POTENTIAL_ENERGY_EFFICIENCY": earliest_record["POTENTIAL_ENERGY_EFFICIENCY"], - "ENVIRONMENT_IMPACT_POTENTIAL": earliest_record["ENVIRONMENT_IMPACT_POTENTIAL"], 
- "ENERGY_CONSUMPTION_POTENTIAL": earliest_record["ENERGY_CONSUMPTION_POTENTIAL"], - "CO2_EMISSIONS_POTENTIAL": earliest_record["CO2_EMISSIONS_POTENTIAL"], - **fixed_data, - **features.to_dict(), - } - ) + # property_model_data.append( + # { + # "UPRN": uprn, + # "RDSAP_CHANGE": rdsap_change, + # "HEAT_DEMAND_CHANGE": heat_demand_change, + # "CARBON_CHANGE": carbon_change, + # "SAP_STARTING": starting_sap, + # "SAP_ENDING": ending_sap, + # "HEAT_DEMAND_STARTING": starting_heat_demand, + # "HEAT_DEMAND_ENDING": ending_heat_demand, + # "CARBON_STARTING": starting_carbon, + # "CARBON_ENDING": ending_carbon, + # "POTENTIAL_ENERGY_EFFICIENCY": earliest_record["POTENTIAL_ENERGY_EFFICIENCY"], + # "ENVIRONMENT_IMPACT_POTENTIAL": earliest_record["ENVIRONMENT_IMPACT_POTENTIAL"], + # "ENERGY_CONSUMPTION_POTENTIAL": earliest_record["ENERGY_CONSUMPTION_POTENTIAL"], + # "CO2_EMISSIONS_POTENTIAL": earliest_record["CO2_EMISSIONS_POTENTIAL"], + # **fixed_data, + # **features.to_dict(), + # } + # ) - data_by_urpn.extend(property_model_data) + # data_by_urpn.extend(property_model_data) + data_by_uprn.extend(property_model_data) + + from etl.epc.Dataset import TrainingDataset + constituency_data = TrainingDataset(datasets=data_by_uprn) data_by_urpn_df = pd.DataFrame(data_by_urpn) - data_by_urpn_df["DAYS_TO_STARTING"] = DataProcessor.calculate_days_to( - data_by_urpn_df["LODGEMENT_DATE_STARTING"] - ) + # # TODO: can we move this into the epc record? 
+ # data_by_urpn_df["DAYS_TO_STARTING"] = DataProcessor.calculate_days_to( + # data_by_urpn_df["LODGEMENT_DATE_STARTING"] + # ) - data_by_urpn_df["DAYS_TO_ENDING"] = DataProcessor.calculate_days_to( - data_by_urpn_df["LODGEMENT_DATE_ENDING"] - ) + # data_by_urpn_df["DAYS_TO_ENDING"] = DataProcessor.calculate_days_to( + # data_by_urpn_df["LODGEMENT_DATE_ENDING"] + # ) - data_by_urpn_df = data_by_urpn_df.drop(columns=["LODGEMENT_DATE_STARTING", "LODGEMENT_DATE_ENDING"]) + # data_by_urpn_df = data_by_urpn_df.drop(columns=["LODGEMENT_DATE_STARTING", "LODGEMENT_DATE_ENDING"]) data_by_urpn_df = DataProcessor.clean_efficiency_variables(data_by_urpn_df) @@ -889,6 +636,7 @@ def app(): output = pd.concat(dataset) + # TODO: move into difference record # Remove any records that have huge swings in their floor area output["tfa_diff_abs"] = abs(output["TOTAL_FLOOR_AREA_ENDING"] - output["TOTAL_FLOOR_AREA_STARTING"]) output["tfa_diff_prop"] = output["tfa_diff_abs"] / output["TOTAL_FLOOR_AREA_STARTING"]