breakout classes

This commit is contained in:
Michael Duong 2023-12-06 17:08:05 +00:00
parent 05c01c1770
commit 353e8a90db
4 changed files with 533 additions and 354 deletions

107
etl/epc/Dataset.py Normal file
View file

@ -0,0 +1,107 @@
import pandas as pd
from typing import List
from etl.epc.EPCRecord import EPCDifferenceRecord
class TrainingDataset:
    """
    A collection of EPCDifferenceRecords combined into a modelling dataframe.

    Each record's ``difference_record`` dict becomes one row of ``self.df``. During construction the
    DAYS_TO_STARTING / DAYS_TO_ENDING features are derived, the raw lodgement date columns are dropped
    and missing efficiency ratings are filled with "NO_RATING".

    Datasets can be merged with ``+`` or ``sum()``; the merged dataset is rebuilt from the combined records.
    """

    def __init__(self, datasets: List["EPCDifferenceRecord"]) -> None:
        """
        :param datasets: records whose ``difference_record`` dicts become the rows of ``self.df``
        """
        self.datasets = datasets
        self.df = pd.DataFrame([dataset.difference_record for dataset in datasets])

        # Order matters: feature generation reads the lodgement date columns that _drop_features removes.
        self._feature_generation()
        self._drop_features()
        self._clean_dataframe()
        self.df = self._clean_efficiency_variables(self.df)

    def _drop_features(self):
        """
        Drop features that are not needed for modelling
        """
        self.df = self.df.drop(columns=["LODGEMENT_DATE_STARTING", "LODGEMENT_DATE_ENDING"])

    def _feature_generation(self):
        """
        Generate features for modelling: the number of days from EARLIEST_EPC_DATE to each lodgement date
        """
        self.df["DAYS_TO_STARTING"] = self._calculate_days_to(self.df["LODGEMENT_DATE_STARTING"])
        self.df["DAYS_TO_ENDING"] = self._calculate_days_to(self.df["LODGEMENT_DATE_ENDING"])

    def _clean_dataframe(self):
        """
        Hook for general dataframe cleaning; currently a no-op.

        NOTE(review): __init__ called this method but the class did not define it, so constructing a
        TrainingDataset always raised AttributeError. The intended cleaning steps are not visible
        here - TODO confirm what belongs in this method.
        """

    @staticmethod
    def _clean_efficiency_variables(df):
        """
        Fill missing energy efficiency ratings with "NO_RATING".

        There is scope to clean this by the mode per corresponding description.
        E.g. for WALLS_ENG_EFF we could look at the mode efficiency rating by description and
        fill in the missing values with this.
        When looking at this initially, there were a large volume of records with missing energy efficiency
        values and therefore a simpler approach was taken just to test including these variables

        :param df: dataframe to clean; columns with missing values are filled in place
        :return: the same dataframe
        :raises ValueError: if a column without "ENERGY_EFF" in its name has missing values
        """
        missing_counts = df.isnull().sum()
        columns_with_gaps = missing_counts[missing_counts >= 1].index

        if len(columns_with_gaps) == 0:
            return df

        # Make sure they are all efficiency columns
        if not columns_with_gaps.str.contains("ENERGY_EFF").all():
            raise ValueError("Non efficiency columns are missing")

        for column in columns_with_gaps:
            df[column] = df[column].fillna("NO_RATING")

        return df

    @staticmethod
    def _calculate_days_to(lodgement_date):
        """
        Number of days between EARLIEST_EPC_DATE and the lodgement date(s).

        :param lodgement_date: a single date string, or a pandas Series of dates
        :return: the day count for a string input, otherwise a Series of day counts
        """
        # NOTE(review): EARLIEST_EPC_DATE is not imported in the visible part of this module - confirm
        # where it is defined and import it, otherwise this raises NameError.
        elapsed = pd.to_datetime(lodgement_date) - pd.to_datetime(EARLIEST_EPC_DATE)

        if isinstance(lodgement_date, str):
            # A scalar Timedelta exposes .days directly (previously misspelt as .daye)
            return elapsed.days

        return elapsed.dt.days

    def __add__(self, other) -> "TrainingDataset":
        """
        Combine two datasets into a new TrainingDataset built from both record lists.

        :raises TypeError: if other is not a TrainingDataset
        """
        if not isinstance(other, TrainingDataset):
            raise TypeError("Addition can only be performed with another instance of TrainingDataset")

        return TrainingDataset(self.datasets + other.datasets)

    def __radd__(self, other):
        """
        Required for sum() to work: an int on the left (such as the 0 that sum() starts from)
        leaves this dataset unchanged.
        """
        if isinstance(other, int):
            return self
        return self.__add__(other)
class ScoringDataset:
    """
    Container for the EPCDifferenceRecords that make up a scoring set.

    Several containers can be merged with ``+`` or ``sum()``.
    """

    def __init__(self, datasets: List[EPCDifferenceRecord]) -> None:
        self.datasets = datasets

    def __add__(self, other) -> "ScoringDataset":
        """
        Return a new ScoringDataset holding the records of both operands.
        """
        if isinstance(other, ScoringDataset):
            combined = self.datasets + other.datasets
            return ScoringDataset(combined)

        raise TypeError("Addition can only be performed with another instance of ScoringDataset")

    def __radd__(self, other):
        """
        Needed so sum() works: an int left operand (such as the 0 that sum() starts from)
        yields this instance unchanged; anything else is handled by __add__.
        """
        return self if isinstance(other, int) else self.__add__(other)

290
etl/epc/EPCRecord.py Normal file
View file

@ -0,0 +1,290 @@
from dataclasses import dataclass
from etl.epc.ValidationConfiguration import (
EPCRecordValidationConfiguration,
EPCDifferenceRecordValidationConfiguration,
EPCDifferenceRecordFixedDataValidationConfiguration
)
from typing import Union, List
from etl.epc.settings import (
RDSAP_RESPONSE,
HEAT_DEMAND_RESPONSE,
CARBON_RESPONSE,
COMPONENT_FEATURES,
EFFICIENCY_FEATURES
)
@dataclass
class EPCRecord:
    """
    A single EPC record.

    Records are ordered by their RDSAP_RESPONSE field (``>``, ``>=``, ``<``, ``<=``) and subtracting
    one record from another (``a - b``) builds an EPCDifferenceRecord for the pair.
    """
    UPRN: str
    WALLS_DESCRIPTION: str
    FLOOR_DESCRIPTION: str
    LIGHTING_DESCRIPTION: str
    ROOF_DESCRIPTION: str
    MAINHEAT_DESCRIPTION: str
    HOTWATER_DESCRIPTION: str
    MAIN_FUEL: str
    MECHANICAL_VENTILATION: str
    SECONDHEAT_DESCRIPTION: str
    WINDOWS_DESCRIPTION: str
    GLAZED_TYPE: str
    MULTI_GLAZE_PROPORTION: float
    LOW_ENERGY_LIGHTING: float
    NUMBER_OPEN_FIREPLACES: float
    MAINHEATCONT_DESCRIPTION: str
    SOLAR_WATER_HEATING_FLAG: str
    PHOTO_SUPPLY: float
    TRANSACTION_TYPE: str
    ENERGY_TARIFF: str
    EXTENSION_COUNT: float
    TOTAL_FLOOR_AREA: float
    FLOOR_HEIGHT: float
    HOT_WATER_ENERGY_EFF: str
    FLOOR_ENERGY_EFF: str
    WINDOWS_ENERGY_EFF: str
    WALLS_ENERGY_EFF: str
    SHEATING_ENERGY_EFF: str
    ROOF_ENERGY_EFF: str
    MAINHEAT_ENERGY_EFF: str
    MAINHEATC_ENERGY_EFF: str
    LIGHTING_ENERGY_EFF: str
    POTENTIAL_ENERGY_EFFICIENCY: float
    ENVIRONMENT_IMPACT_POTENTIAL: float
    ENERGY_CONSUMPTION_POTENTIAL: float
    CO2_EMISSIONS_POTENTIAL: float
    LODGEMENT_DATE: str
    CURRENT_ENERGY_EFFICIENCY: int
    ENERGY_CONSUMPTION_CURRENT: int
    CO2_EMISSIONS_CURRENT: float

    def __post_init__(self):
        """
        Attach the field validation configuration to the record.

        Validation is not run on construction; call _field_validation() explicitly to check the fields.
        Cleaning steps for individual fields could also be added here.
        """
        self.validation_configuration = EPCRecordValidationConfiguration

    def _field_validation(self):
        """
        Validate every field named in ``self.validation_configuration``.

        :raises ValueError: if a field fails its checks, or its configured type is not "string" or "float"
        """
        for record_key, validation_config in self.validation_configuration.items():
            # Get the variable named record key from self
            field_value = self.__dict__[record_key]
            field_type = validation_config['type']

            if field_type == "string":
                self._validate_string(record_key, field_value, validation_config)
            elif field_type == "float":
                # record_key was previously omitted here, which raised TypeError for every float field
                self._validate_float(record_key, field_value, validation_config)
            else:
                raise ValueError(f"Validation type {validation_config['type']} not supported")

    @staticmethod
    def _apply_validation_function(record_key: str, field_value: Union[str, float], validation_config: dict):
        """
        Run the optional 'function' entry of a validation config against the value.

        Any exception raised by the function is reported as a ValueError, chained to the original error.
        """
        if 'function' not in validation_config:
            return

        try:
            validation_config['function'](field_value)
        except Exception as err:
            raise ValueError(f"Field {record_key} has value {field_value} which does not pass the validation function {validation_config['function']}") from err

    def _validate_string(self, record_key: str, field_value: Union[str, float], validation_config: dict):
        """
        Validate a string field

        :raises ValueError: if the value is not a str, fails the optional validation function, or is
            not one of the configured acceptable values
        """
        if not isinstance(field_value, str):
            raise ValueError(f"Field {record_key} has value {field_value} which is not a string")

        self._apply_validation_function(record_key, field_value, validation_config)

        # A missing 'acceptable_values' entry is treated the same as None: no restriction
        acceptable_values = validation_config.get('acceptable_values')
        if acceptable_values is not None and field_value not in acceptable_values:
            raise ValueError(f"Field {record_key} has value {field_value} which is not in the acceptable values of {acceptable_values}")

    def _validate_float(self, record_key: str, field_value: Union[str, float], validation_config: dict):
        """
        Validate a float field

        :raises ValueError: if the value is not a float, fails the optional validation function, or is
            outside the configured [low, high] range
        """
        if not isinstance(field_value, float):
            raise ValueError(f"Field {record_key} has value {field_value} which is not a float")

        self._apply_validation_function(record_key, field_value, validation_config)

        # A missing 'range' entry is treated the same as None: no restriction
        value_range = validation_config.get('range')
        if value_range is not None:
            if field_value < value_range[0] or field_value > value_range[1]:
                raise ValueError(f"Field {record_key} has value {field_value} which is not in the acceptable range of {value_range}")

    def __sub__(self, other):
        """
        This method will return the difference between two EPC records as an EPCDifferenceRecord.
        The pair is passed with auto_sort=True, so EPCDifferenceRecord decides which record is the start.

        :raises ValueError: if other is not an EPCRecord
        """
        if not isinstance(other, EPCRecord):
            raise ValueError("Can only subtract EPCRecord from EPCRecord")

        return EPCDifferenceRecord(record1=self, record2=other, auto_sort=True)

    def _rdsap_pair(self, other):
        """
        Return (own, other) RDSAP_RESPONSE values for a comparison.

        :raises ValueError: if other is not an EPCRecord
        """
        if not isinstance(other, EPCRecord):
            raise ValueError("Can only compare EPCRecord to EPCRecord")
        return self.__dict__[RDSAP_RESPONSE], other.__dict__[RDSAP_RESPONSE]

    def __gt__(self, other):
        """
        This method will return True if the EPC record is strictly greater than the other
        """
        mine, theirs = self._rdsap_pair(other)
        return mine > theirs

    def __ge__(self, other):
        """
        This method will return True if the EPC record is greater than or equal to the other
        """
        mine, theirs = self._rdsap_pair(other)
        return mine >= theirs

    def __lt__(self, other):
        """
        This method will return True if the EPC record is strictly less than the other
        """
        mine, theirs = self._rdsap_pair(other)
        return mine < theirs

    def __le__(self, other):
        """
        This method will return True if the EPC record is less than or equal to the other
        """
        mine, theirs = self._rdsap_pair(other)
        return mine <= theirs

    def get(self, key: Union[str, List[str]], return_asdict: bool = False, key_suffix: str = None):
        """
        This method will return the value of the key (or keys); unknown keys yield None.

        :param key: a field name or a list of field names
        :param return_asdict: when True, return a dict of name -> value instead of bare values
        :param key_suffix: appended to every name in the returned dict (only used with return_asdict)
        :return: a dict when return_asdict is True, a list for a list of keys, a single value for a
            string key, and None for any other key type
        """
        values = self.__dict__

        if return_asdict:
            # A single name is wrapped so that it is not iterated character by character
            names = [key] if isinstance(key, str) else key
            output_dict = {name: values.get(name) for name in names}
            if key_suffix is not None:
                output_dict = {f"{name}{key_suffix}": value for name, value in output_dict.items()}
            return output_dict

        if isinstance(key, list):
            return [values.get(name) for name in key]
        if isinstance(key, str):
            return values.get(key)
        return None
class EPCDifferenceRecord:
    """
    Base class for the difference between two EPC records.

    ``record1`` is treated as the starting record and ``record2`` as the ending record; the flattened
    comparison is held in ``self.difference_record``.
    """

    def __init__(self, record1: "EPCRecord", record2: "EPCRecord", auto_sort: bool = False):
        """
        This method will initialise the EPCDifferenceRecord

        :param record1: the starting record
        :param record2: the ending record; default usage is for this to have the higher RDSAP score
        :param auto_sort: when True, the two records are swapped if record2 <= record1
        """
        self.record1 = record1
        self.record2 = record2
        self.difference_record = {}
        self.difference_validation_configuration = EPCDifferenceRecordValidationConfiguration
        self.fixed_data_validation_configuration = EPCDifferenceRecordFixedDataValidationConfiguration

        # NOTE(review): "<=" means the records are also swapped when they compare equal - confirm intended
        if auto_sort and (self.record2 <= self.record1):
            self.record1, self.record2 = self.record2, self.record1

        self._construct_difference_record()
        self._validate_difference_record()

    def _construct_difference_record(self):
        """
        This method will construct the difference record between the two records.

        The result holds the UPRN of record1, the change / starting / ending values of the three
        response variables, the larger of the two records' potential values, and the component and
        efficiency features plus LODGEMENT_DATE of each record suffixed with _ENDING / _STARTING.
        """
        start, end = self.record1, self.record2

        starting_sap, ending_sap = start.get(RDSAP_RESPONSE), end.get(RDSAP_RESPONSE)
        starting_heat_demand, ending_heat_demand = start.get(HEAT_DEMAND_RESPONSE), end.get(HEAT_DEMAND_RESPONSE)
        starting_carbon, ending_carbon = start.get(CARBON_RESPONSE), end.get(CARBON_RESPONSE)

        suffixed_variables = COMPONENT_FEATURES + EFFICIENCY_FEATURES + ["LODGEMENT_DATE"]
        ending_record = end.get(suffixed_variables, return_asdict=True, key_suffix="_ENDING")
        starting_record = start.get(suffixed_variables, return_asdict=True, key_suffix="_STARTING")

        # TODO: DO we want to take the earliest potentials or max potentials?
        potential_fields = (
            "POTENTIAL_ENERGY_EFFICIENCY",
            "ENVIRONMENT_IMPACT_POTENTIAL",
            "ENERGY_CONSUMPTION_POTENTIAL",
            "CO2_EMISSIONS_POTENTIAL",
        )
        potentials = {name: max(start.get(name), end.get(name)) for name in potential_fields}

        # Key order is kept stable because it determines the column order of any dataframe built from it
        self.difference_record = {
            "UPRN": start.get("UPRN"),
            "RDSAP_CHANGE": ending_sap - starting_sap,
            "HEAT_DEMAND_CHANGE": ending_heat_demand - starting_heat_demand,
            "CARBON_CHANGE": ending_carbon - starting_carbon,
            "SAP_STARTING": starting_sap,
            "SAP_ENDING": ending_sap,
            "HEAT_DEMAND_STARTING": starting_heat_demand,
            "HEAT_DEMAND_ENDING": ending_heat_demand,
            "CARBON_STARTING": starting_carbon,
            "CARBON_ENDING": ending_carbon,
            **potentials,
            **ending_record,
            **starting_record,
        }

    def _validate_difference_record(self):
        """
        This method will validate the difference record.

        Currently a placeholder: no checks are performed. A check rejecting negative numeric values
        was sketched here previously but is disabled.
        """

    def compare_fields_in_records(self, fields: List[str]):
        """
        This method will compare the records, for specific fields

        :param fields: names of the fields to compare
        :return: True if every listed field has the same value in both records (also for an empty
            list), otherwise False
        """
        return not any(self.record1.get(field) != self.record2.get(field) for field in fields)

    def get(self, key: str):
        """
        This method will return the value of the key in the difference record, or None if it is absent
        """
        return self.difference_record.get(key)

    def append_fixed_data(self, fixed_data: dict):
        """
        This method will append fixed data to the difference record.
        Keys already present in the difference record are overwritten.
        """
        self._validate_fixed_data(fixed_data)
        self.difference_record.update(fixed_data)

    def _validate_fixed_data(self, fixed_data: dict):
        """
        This method will validate the fixed data.

        Currently a placeholder: no checks are performed. More sophisticated checks could be driven
        by self.fixed_data_validation_configuration.
        """

View file

@ -21,3 +21,37 @@ EPCRecordValidationConfiguration = {
"range": [0, 100]
}
}
# Validation rules for the computed fields of an EPCDifferenceRecord; no rules are defined yet.
EPCDifferenceRecordValidationConfiguration = {
}
# Validation rules for the fixed property data appended to an EPCDifferenceRecord, keyed by field name.
# Each rule gives a "type" plus either the "acceptable_values" or the "range" ([low, high]) for the field.
# NOTE(review): the "integer" type used below is not handled by EPCRecord._field_validation, which only
# accepts "string" and "float" - confirm how these rules are meant to be applied.
EPCDifferenceRecordFixedDataValidationConfiguration = {
    "PROPERTY_TYPE": {
        "type": "string",
        "acceptable_values": ["House", "Flat", "Bungalow", "Maisonette", "Park home", "Other"]
    },
    "BUILT_FORM": {
        "type": "string",
        "acceptable_values": ["Detached", "Semi-Detached", "End-Terrace", "Mid-Terrace", "Enclosed Mid-Terrace", "Enclosed End-Terrace", "Enclosed Detached", "Not applicable"]
    },
    # NOTE(review): "CONSITUENCY" looks like a misspelling of "CONSTITUENCY", and the acceptable values
    # are countries - confirm against the keys and values of the fixed data before renaming.
    "CONSITUENCY": {
        "type": "string",
        "acceptable_values": ["England", "Wales", "Scotland", "Northern Ireland"]
    },
    "NUMBER_HABITABLE_ROOMS": {
        "type": "integer",
        "range": [0, 100]
    },
    "NUMBER_HEATED_ROOMS": {
        "type": "integer",
        "range": [0, 100]
    },
    "FIXED_LIGHTING_OUTLETS_COUNT": {
        "type": "integer",
        "range": [0, 100]
    },
    # NOTE(review): an empty list is not None, so a check of the form used in EPCRecord._validate_string
    # would reject every value for this field - None may be what is intended; confirm.
    "CONSTRUCTION_AGE_BAND": {
        "type": "string",
        "acceptable_values": []
    }
}

View file

@ -18,6 +18,7 @@ from etl.epc.settings import (
MINIMUM_FLOOR_HEIGHT
)
from etl.epc.DataProcessor import DataProcessor
from etl.epc.EPCRecord import EPCRecord, EPCDifferenceRecord
from utils.s3 import save_dataframe_to_s3_parquet, read_from_s3
from recommendations.rdsap_tables import england_wales_age_band_lookup
from recommendations.recommendation_utils import (
@ -223,6 +224,7 @@ def make_uvalues(df):
df["row_index"] = df.index
uvalues = []
# TODO: iterrows is the slowest way to do this, we should use a vectorised approach or itertuples
for _, x in df.iterrows():
uprn = x["UPRN"]
@ -379,285 +381,23 @@ def make_uvalues(df):
return df
def compare_records(earliest_record: pd.Series, latest_record: pd.Series, columns: list):
    """
    For a list of columns, check if the earliest and latest record are the same
    If they are the same, we indicate this, because we have example of SAP scores changing
    without any feature changes

    :param earliest_record: pd.Series
    :param latest_record: pd.Series
    :param columns: list of columns to compare
    :return: boolean indicating whether or not all features are the same (True for an empty list)
    """
    # Stops at the first column whose values differ
    return not any(earliest_record[col] != latest_record[col] for col in columns)
from dataclasses import dataclass
from etl.epc.ValidationConfiguration import EPCRecordValidationConfiguration
from typing import Union, List
@dataclass
class EPCRecord:
    """
    A single EPC record.

    Records are ordered by their RDSAP_RESPONSE field (``>``, ``>=``, ``<``, ``<=``) and subtracting
    one record from another (``a - b``) builds an EPCDifferenceRecord for the pair.
    """
    UPRN: str
    WALLS_DESCRIPTION: str
    FLOOR_DESCRIPTION: str
    LIGHTING_DESCRIPTION: str
    ROOF_DESCRIPTION: str
    MAINHEAT_DESCRIPTION: str
    HOTWATER_DESCRIPTION: str
    MAIN_FUEL: str
    MECHANICAL_VENTILATION: str
    SECONDHEAT_DESCRIPTION: str
    WINDOWS_DESCRIPTION: str
    GLAZED_TYPE: str
    MULTI_GLAZE_PROPORTION: float
    LOW_ENERGY_LIGHTING: float
    NUMBER_OPEN_FIREPLACES: float
    MAINHEATCONT_DESCRIPTION: str
    SOLAR_WATER_HEATING_FLAG: str
    PHOTO_SUPPLY: float
    TRANSACTION_TYPE: str
    ENERGY_TARIFF: str
    EXTENSION_COUNT: float
    TOTAL_FLOOR_AREA: float
    FLOOR_HEIGHT: float
    HOT_WATER_ENERGY_EFF: str
    FLOOR_ENERGY_EFF: str
    WINDOWS_ENERGY_EFF: str
    WALLS_ENERGY_EFF: str
    SHEATING_ENERGY_EFF: str
    ROOF_ENERGY_EFF: str
    MAINHEAT_ENERGY_EFF: str
    MAINHEATC_ENERGY_EFF: str
    LIGHTING_ENERGY_EFF: str
    POTENTIAL_ENERGY_EFFICIENCY: float
    ENVIRONMENT_IMPACT_POTENTIAL: float
    ENERGY_CONSUMPTION_POTENTIAL: float
    CO2_EMISSIONS_POTENTIAL: float
    LODGEMENT_DATE: str
    CURRENT_ENERGY_EFFICIENCY: int
    ENERGY_CONSUMPTION_CURRENT: int
    CO2_EMISSIONS_CURRENT: float

    def __post_init__(self):
        """
        Attach the field validation configuration to the record.

        Validation is not run on construction; call _field_validation() explicitly to check the fields.
        Cleaning steps for individual fields could also be added here.
        """
        self.validation_configuration = EPCRecordValidationConfiguration

    def _field_validation(self):
        """
        Validate every field named in ``self.validation_configuration``.

        :raises ValueError: if a field fails its checks, or its configured type is not "string" or "float"
        """
        for record_key, validation_config in self.validation_configuration.items():
            # Get the variable named record key from self
            field_value = self.__dict__[record_key]
            field_type = validation_config['type']

            if field_type == "string":
                self._validate_string(record_key, field_value, validation_config)
            elif field_type == "float":
                # record_key was previously omitted here, which raised TypeError for every float field
                self._validate_float(record_key, field_value, validation_config)
            else:
                raise ValueError(f"Validation type {validation_config['type']} not supported")

    @staticmethod
    def _apply_validation_function(record_key: str, field_value: Union[str, float], validation_config: dict):
        """
        Run the optional 'function' entry of a validation config against the value.

        Any exception raised by the function is reported as a ValueError, chained to the original error.
        """
        if 'function' not in validation_config:
            return

        try:
            validation_config['function'](field_value)
        except Exception as err:
            raise ValueError(f"Field {record_key} has value {field_value} which does not pass the validation function {validation_config['function']}") from err

    def _validate_string(self, record_key: str, field_value: Union[str, float], validation_config: dict):
        """
        Validate a string field

        :raises ValueError: if the value is not a str, fails the optional validation function, or is
            not one of the configured acceptable values
        """
        if not isinstance(field_value, str):
            raise ValueError(f"Field {record_key} has value {field_value} which is not a string")

        self._apply_validation_function(record_key, field_value, validation_config)

        # A missing 'acceptable_values' entry is treated the same as None: no restriction
        acceptable_values = validation_config.get('acceptable_values')
        if acceptable_values is not None and field_value not in acceptable_values:
            raise ValueError(f"Field {record_key} has value {field_value} which is not in the acceptable values of {acceptable_values}")

    def _validate_float(self, record_key: str, field_value: Union[str, float], validation_config: dict):
        """
        Validate a float field

        :raises ValueError: if the value is not a float, fails the optional validation function, or is
            outside the configured [low, high] range
        """
        if not isinstance(field_value, float):
            raise ValueError(f"Field {record_key} has value {field_value} which is not a float")

        self._apply_validation_function(record_key, field_value, validation_config)

        # A missing 'range' entry is treated the same as None: no restriction
        value_range = validation_config.get('range')
        if value_range is not None:
            if field_value < value_range[0] or field_value > value_range[1]:
                raise ValueError(f"Field {record_key} has value {field_value} which is not in the acceptable range of {value_range}")

    def __sub__(self, other):
        """
        This method will return the difference between two EPC records as an EPCDifferenceRecord.
        The pair is passed with auto_sort=True, so EPCDifferenceRecord decides which record is the start.

        :raises ValueError: if other is not an EPCRecord
        """
        if not isinstance(other, EPCRecord):
            raise ValueError("Can only subtract EPCRecord from EPCRecord")

        return EPCDifferenceRecord(record1=self, record2=other, auto_sort=True)

    def _rdsap_pair(self, other):
        """
        Return (own, other) RDSAP_RESPONSE values for a comparison.

        :raises ValueError: if other is not an EPCRecord
        """
        if not isinstance(other, EPCRecord):
            raise ValueError("Can only compare EPCRecord to EPCRecord")
        return self.__dict__[RDSAP_RESPONSE], other.__dict__[RDSAP_RESPONSE]

    def __gt__(self, other):
        """
        This method will return True if the EPC record is strictly greater than the other
        """
        mine, theirs = self._rdsap_pair(other)
        return mine > theirs

    def __ge__(self, other):
        """
        This method will return True if the EPC record is greater than or equal to the other
        """
        mine, theirs = self._rdsap_pair(other)
        return mine >= theirs

    def __lt__(self, other):
        """
        This method will return True if the EPC record is strictly less than the other
        """
        mine, theirs = self._rdsap_pair(other)
        return mine < theirs

    def __le__(self, other):
        """
        This method will return True if the EPC record is less than or equal to the other
        """
        mine, theirs = self._rdsap_pair(other)
        return mine <= theirs

    def get(self, key: Union[str, List[str]], return_asdict: bool = False, key_suffix: str = None):
        """
        This method will return the value of the key (or keys); unknown keys yield None.

        :param key: a field name or a list of field names
        :param return_asdict: when True, return a dict of name -> value instead of bare values
        :param key_suffix: appended verbatim to every name in the returned dict (only used with
            return_asdict); callers pass the separator themselves, e.g. "_ENDING"
        :return: a dict when return_asdict is True, a list for a list of keys, a single value for a
            string key, and None for any other key type
        """
        values = self.__dict__

        if return_asdict:
            # A single name is wrapped so that it is not iterated character by character
            names = [key] if isinstance(key, str) else key
            output_dict = {name: values.get(name) for name in names}
            if key_suffix is not None:
                # No extra "_" is inserted: the suffix already carries its own separator
                output_dict = {f"{name}{key_suffix}": value for name, value in output_dict.items()}
            return output_dict

        if isinstance(key, list):
            return [values.get(name) for name in key]
        if isinstance(key, str):
            return values.get(key)
        return None
class EPCDifferenceRecord:
    """
    Base class for the difference between two EPC records.

    ``record1`` is treated as the starting record and ``record2`` as the ending record; the flattened
    comparison is held in ``self.difference_record``.
    """

    def __init__(self, record1: "EPCRecord", record2: "EPCRecord", auto_sort: bool = False):
        """
        This method will initialise the EPCDifferenceRecord

        :param record1: the starting record
        :param record2: the ending record; default usage is for this to have the higher RDSAP score
        :param auto_sort: when True, the two records are swapped if record2 <= record1
        """
        self.record1 = record1
        self.record2 = record2
        self.difference_record = {}

        # NOTE(review): "<=" means the records are also swapped when they compare equal - confirm intended
        if auto_sort and (self.record2 <= self.record1):
            self.record1, self.record2 = self.record2, self.record1

        self._construct_difference_record()
        self._validate_difference_record()

    def _construct_difference_record(self):
        """
        This method will construct the difference record between the two records.

        The result holds the UPRN of record1, the change / starting / ending values of the three
        response variables, the larger of the two records' potential values, and the component and
        efficiency features plus LODGEMENT_DATE of each record suffixed with _ENDING / _STARTING.
        """
        start, end = self.record1, self.record2

        starting_sap, ending_sap = start.get(RDSAP_RESPONSE), end.get(RDSAP_RESPONSE)
        starting_heat_demand, ending_heat_demand = start.get(HEAT_DEMAND_RESPONSE), end.get(HEAT_DEMAND_RESPONSE)
        starting_carbon, ending_carbon = start.get(CARBON_RESPONSE), end.get(CARBON_RESPONSE)

        suffixed_variables = COMPONENT_FEATURES + EFFICIENCY_FEATURES + ["LODGEMENT_DATE"]
        ending_record = end.get(suffixed_variables, return_asdict=True, key_suffix="_ENDING")
        starting_record = start.get(suffixed_variables, return_asdict=True, key_suffix="_STARTING")

        # TODO: DO we want to take the earliest potentials or max potentials?
        potential_fields = (
            "POTENTIAL_ENERGY_EFFICIENCY",
            "ENVIRONMENT_IMPACT_POTENTIAL",
            "ENERGY_CONSUMPTION_POTENTIAL",
            "CO2_EMISSIONS_POTENTIAL",
        )
        potentials = {name: max(start.get(name), end.get(name)) for name in potential_fields}

        # Key order is kept stable because it determines the column order of any dataframe built from it
        self.difference_record = {
            "UPRN": start.get("UPRN"),
            "RDSAP_CHANGE": ending_sap - starting_sap,
            "HEAT_DEMAND_CHANGE": ending_heat_demand - starting_heat_demand,
            "CARBON_CHANGE": ending_carbon - starting_carbon,
            "SAP_STARTING": starting_sap,
            "SAP_ENDING": ending_sap,
            "HEAT_DEMAND_STARTING": starting_heat_demand,
            "HEAT_DEMAND_ENDING": ending_heat_demand,
            "CARBON_STARTING": starting_carbon,
            "CARBON_ENDING": ending_carbon,
            **potentials,
            **ending_record,
            **starting_record,
        }

    def _validate_difference_record(self):
        """
        This method will validate the difference record.

        Currently a placeholder: no checks are performed. A check rejecting negative numeric values
        was sketched here previously but is disabled.
        """

    def compare_fields_in_records(self, fields: List[str]):
        """
        This method will compare the records, for specific fields

        :param fields: names of the fields to compare
        :return: True if every listed field has the same value in both records (also for an empty
            list), otherwise False
        """
        return not any(self.record1.get(field) != self.record2.get(field) for field in fields)

    def get(self, key: str):
        """
        This method will return the value of the key in the difference record, or None if it is absent
        """
        return self.difference_record.get(key)

    def append_fixed_data(self, fixed_data: dict):
        """
        This method will append fixed data to the difference record.
        Keys already present in the difference record are overwritten.
        """
        self.difference_record.update(fixed_data)
# all_equal = True
# for col in columns:
# if earliest_record[col] != latest_record[col]:
# return False
# if all_equal:
# return True
def app():
# Get all the files in the directory
@ -686,9 +426,9 @@ def app():
cleaning_dataset.append(data_processor.cleaning_averages)
data_by_urpn = []
data_by_uprn = []
for uprn, property_data in df.groupby("UPRN", observed=True):
asdasd
# Fixed features - these are property attributes that shouldn't change over time
fixed_data = {}
@ -748,102 +488,109 @@ def app():
difference_record.append_fixed_data(fixed_data)
property_model_data.append(difference_record.difference_record)
property_model_data.append(difference_record)
for idx in range(0, property_data.shape[0] - 1):
# property_model_data.append(difference_record.difference_record)
# for idx in range(0, property_data.shape[0] - 1):
if idx >= property_data.shape[0] - 1:
break
# if idx >= property_data.shape[0] - 1:
# break
earliest_record = variable_data.iloc[idx]
latest_record = variable_data.iloc[idx + 1]
# earliest_record = variable_data.iloc[idx]
# latest_record = variable_data.iloc[idx + 1]
# Check if the sap gets better or worse
gets_better = earliest_record[RDSAP_RESPONSE] <= latest_record[RDSAP_RESPONSE]
# # Check if the sap gets better or worse
# gets_better = earliest_record[RDSAP_RESPONSE] <= latest_record[RDSAP_RESPONSE]
component_variables = COMPONENT_FEATURES + EFFICIENCY_FEATURES
# component_variables = COMPONENT_FEATURES + EFFICIENCY_FEATURES
if gets_better:
starting_sap = earliest_record[RDSAP_RESPONSE]
starting_heat_demand = earliest_record[HEAT_DEMAND_RESPONSE]
starting_carbon = earliest_record[CARBON_RESPONSE]
# if gets_better:
# starting_sap = earliest_record[RDSAP_RESPONSE]
# starting_heat_demand = earliest_record[HEAT_DEMAND_RESPONSE]
# starting_carbon = earliest_record[CARBON_RESPONSE]
ending_sap = latest_record[RDSAP_RESPONSE]
ending_heat_demand = latest_record[HEAT_DEMAND_RESPONSE]
ending_carbon = latest_record[CARBON_RESPONSE]
# ending_sap = latest_record[RDSAP_RESPONSE]
# ending_heat_demand = latest_record[HEAT_DEMAND_RESPONSE]
# ending_carbon = latest_record[CARBON_RESPONSE]
rdsap_change = latest_record[RDSAP_RESPONSE] - starting_sap
heat_demand_change = latest_record[HEAT_DEMAND_RESPONSE] - starting_heat_demand
carbon_change = latest_record[CARBON_RESPONSE] - starting_carbon
# rdsap_change = latest_record[RDSAP_RESPONSE] - starting_sap
# heat_demand_change = latest_record[HEAT_DEMAND_RESPONSE] - starting_heat_demand
# carbon_change = latest_record[CARBON_RESPONSE] - starting_carbon
starting_record = earliest_record[component_variables + ["LODGEMENT_DATE"]].add_suffix("_STARTING")
ending_record = latest_record[component_variables + ["LODGEMENT_DATE"]].add_suffix("_ENDING")
else:
starting_sap = latest_record[RDSAP_RESPONSE]
starting_heat_demand = latest_record[HEAT_DEMAND_RESPONSE]
starting_carbon = latest_record[CARBON_RESPONSE]
# starting_record = earliest_record[component_variables + ["LODGEMENT_DATE"]].add_suffix("_STARTING")
# ending_record = latest_record[component_variables + ["LODGEMENT_DATE"]].add_suffix("_ENDING")
# else:
# starting_sap = latest_record[RDSAP_RESPONSE]
# starting_heat_demand = latest_record[HEAT_DEMAND_RESPONSE]
# starting_carbon = latest_record[CARBON_RESPONSE]
ending_sap = earliest_record[RDSAP_RESPONSE]
ending_heat_demand = earliest_record[HEAT_DEMAND_RESPONSE]
ending_carbon = earliest_record[CARBON_RESPONSE]
# ending_sap = earliest_record[RDSAP_RESPONSE]
# ending_heat_demand = earliest_record[HEAT_DEMAND_RESPONSE]
# ending_carbon = earliest_record[CARBON_RESPONSE]
rdsap_change = earliest_record[RDSAP_RESPONSE] - starting_sap
heat_demand_change = earliest_record[HEAT_DEMAND_RESPONSE] - starting_heat_demand
carbon_change = earliest_record[CARBON_RESPONSE] - starting_carbon
# rdsap_change = earliest_record[RDSAP_RESPONSE] - starting_sap
# heat_demand_change = earliest_record[HEAT_DEMAND_RESPONSE] - starting_heat_demand
# carbon_change = earliest_record[CARBON_RESPONSE] - starting_carbon
starting_record = latest_record[component_variables + ["LODGEMENT_DATE"]].add_suffix("_STARTING")
ending_record = earliest_record[component_variables + ["LODGEMENT_DATE"]].add_suffix("_ENDING")
# starting_record = latest_record[component_variables + ["LODGEMENT_DATE"]].add_suffix("_STARTING")
# ending_record = earliest_record[component_variables + ["LODGEMENT_DATE"]].add_suffix("_ENDING")
if rdsap_change == 0:
continue
# if rdsap_change == 0:
# continue
all_equal = compare_records(
earliest_record=earliest_record,
latest_record=latest_record,
columns=CORE_COMPONENT_FEATURES
)
# all_equal = compare_records(
# earliest_record=earliest_record,
# latest_record=latest_record,
# columns=CORE_COMPONENT_FEATURES
# )
if all_equal:
# Keep track of this for the moment so we can analyse
all_equal_rows.append({"uprn": uprn, "directory_name": directory.name})
continue
asdasd
features = pd.concat([starting_record, ending_record])
# if all_equal:
# # Keep track of this for the moment so we can analyse
# all_equal_rows.append({"uprn": uprn, "directory_name": directory.name})
# continue
# asdasd
# features = pd.concat([starting_record, ending_record])
property_model_data.append(
{
"UPRN": uprn,
"RDSAP_CHANGE": rdsap_change,
"HEAT_DEMAND_CHANGE": heat_demand_change,
"CARBON_CHANGE": carbon_change,
"SAP_STARTING": starting_sap,
"SAP_ENDING": ending_sap,
"HEAT_DEMAND_STARTING": starting_heat_demand,
"HEAT_DEMAND_ENDING": ending_heat_demand,
"CARBON_STARTING": starting_carbon,
"CARBON_ENDING": ending_carbon,
"POTENTIAL_ENERGY_EFFICIENCY": earliest_record["POTENTIAL_ENERGY_EFFICIENCY"],
"ENVIRONMENT_IMPACT_POTENTIAL": earliest_record["ENVIRONMENT_IMPACT_POTENTIAL"],
"ENERGY_CONSUMPTION_POTENTIAL": earliest_record["ENERGY_CONSUMPTION_POTENTIAL"],
"CO2_EMISSIONS_POTENTIAL": earliest_record["CO2_EMISSIONS_POTENTIAL"],
**fixed_data,
**features.to_dict(),
}
)
# property_model_data.append(
# {
# "UPRN": uprn,
# "RDSAP_CHANGE": rdsap_change,
# "HEAT_DEMAND_CHANGE": heat_demand_change,
# "CARBON_CHANGE": carbon_change,
# "SAP_STARTING": starting_sap,
# "SAP_ENDING": ending_sap,
# "HEAT_DEMAND_STARTING": starting_heat_demand,
# "HEAT_DEMAND_ENDING": ending_heat_demand,
# "CARBON_STARTING": starting_carbon,
# "CARBON_ENDING": ending_carbon,
# "POTENTIAL_ENERGY_EFFICIENCY": earliest_record["POTENTIAL_ENERGY_EFFICIENCY"],
# "ENVIRONMENT_IMPACT_POTENTIAL": earliest_record["ENVIRONMENT_IMPACT_POTENTIAL"],
# "ENERGY_CONSUMPTION_POTENTIAL": earliest_record["ENERGY_CONSUMPTION_POTENTIAL"],
# "CO2_EMISSIONS_POTENTIAL": earliest_record["CO2_EMISSIONS_POTENTIAL"],
# **fixed_data,
# **features.to_dict(),
# }
# )
data_by_urpn.extend(property_model_data)
# data_by_urpn.extend(property_model_data)
data_by_uprn.extend(property_model_data)
from etl.epc.Dataset import TrainingDataset
constituency_data = TrainingDataset(datasets=data_by_uprn)
data_by_urpn_df = pd.DataFrame(data_by_urpn)
data_by_urpn_df["DAYS_TO_STARTING"] = DataProcessor.calculate_days_to(
data_by_urpn_df["LODGEMENT_DATE_STARTING"]
)
# # TODO: can we move this into the epc record?
# data_by_urpn_df["DAYS_TO_STARTING"] = DataProcessor.calculate_days_to(
# data_by_urpn_df["LODGEMENT_DATE_STARTING"]
# )
data_by_urpn_df["DAYS_TO_ENDING"] = DataProcessor.calculate_days_to(
data_by_urpn_df["LODGEMENT_DATE_ENDING"]
)
# data_by_urpn_df["DAYS_TO_ENDING"] = DataProcessor.calculate_days_to(
# data_by_urpn_df["LODGEMENT_DATE_ENDING"]
# )
data_by_urpn_df = data_by_urpn_df.drop(columns=["LODGEMENT_DATE_STARTING", "LODGEMENT_DATE_ENDING"])
# data_by_urpn_df = data_by_urpn_df.drop(columns=["LODGEMENT_DATE_STARTING", "LODGEMENT_DATE_ENDING"])
data_by_urpn_df = DataProcessor.clean_efficiency_variables(data_by_urpn_df)
@ -889,6 +636,7 @@ def app():
output = pd.concat(dataset)
# TODO: move into difference record
# Remove any records that have huge swings in their floor area
output["tfa_diff_abs"] = abs(output["TOTAL_FLOOR_AREA_ENDING"] - output["TOTAL_FLOOR_AREA_STARTING"])
output["tfa_diff_prop"] = output["tfa_diff_abs"] / output["TOTAL_FLOOR_AREA_STARTING"]