mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
breakout classes
This commit is contained in:
parent
05c01c1770
commit
353e8a90db
4 changed files with 533 additions and 354 deletions
107
etl/epc/Dataset.py
Normal file
107
etl/epc/Dataset.py
Normal file
|
|
@ -0,0 +1,107 @@
|
|||
import pandas as pd
|
||||
from typing import List
|
||||
from etl.epc.EPCRecord import EPCDifferenceRecord
|
||||
|
||||
class TrainingDataset:
    """
    A collection of EPCDifferenceRecords combined into a TrainingDataset.

    The ``difference_record`` dict of every record becomes one row of
    ``self.df``. On construction, day-offset features are derived from the
    lodgement dates, the raw date columns are dropped and missing energy
    efficiency ratings are replaced with ``"NO_RATING"``.
    """

    def __init__(self, datasets: List["EPCDifferenceRecord"]) -> None:
        """
        :param datasets: records exposing a ``difference_record`` dict
        """
        self.datasets = datasets
        self.df = pd.DataFrame([dataset.difference_record for dataset in datasets])

        # Order matters: the lodgement date columns must still exist when the
        # DAYS_TO_* features are generated; they are dropped straight after.
        self._feature_generation()
        self._drop_features()
        self._clean_dataframe()
        self.df = self._clean_efficiency_variables(self.df)

    def _drop_features(self):
        """
        Drop features that are not needed for modelling
        """
        self.df = self.df.drop(columns=["LODGEMENT_DATE_STARTING", "LODGEMENT_DATE_ENDING"])

    def _feature_generation(self):
        """
        Generate features for modelling
        """
        self.df["DAYS_TO_STARTING"] = self._calculate_days_to(self.df["LODGEMENT_DATE_STARTING"])
        self.df["DAYS_TO_ENDING"] = self._calculate_days_to(self.df["LODGEMENT_DATE_ENDING"])

    def _clean_dataframe(self):
        """
        Hook for general DataFrame cleaning; currently a no-op.

        NOTE(review): the constructor invokes this step but no cleaning rules
        are defined for it yet. It is an explicit no-op so that building a
        TrainingDataset does not fail.
        TODO: implement the intended cleaning rules.
        """

    @staticmethod
    def _clean_efficiency_variables(df):
        """
        Fill missing energy efficiency ratings with "NO_RATING".

        There is scope to clean this per corresponding description.
        E.g. for WALLS_ENG_EFF we could look at the mode efficiency rating by description and
        fill in the missing values with this.
        When looking at this initially, there are a large volume of records with missing energy efficiency
        values and therefore a simpler approach was taken just to test including these variables

        :param df: DataFrame to clean; modified in place
        :return: the same DataFrame
        :raises ValueError: if a column whose name does not contain "ENERGY_EFF" has missing values
        """
        missings = pd.isnull(df).sum()
        missings = missings[missings >= 1]

        if missings.empty:
            return df

        # Make sure they are all efficiency columns
        if any(~missings.index.str.contains("ENERGY_EFF")):
            raise ValueError("Non efficiency columns are missing")

        for m in missings.index:
            df[m] = df[m].fillna("NO_RATING")

        return df

    @staticmethod
    def _calculate_days_to(lodgement_date):
        """
        Number of days between EARLIEST_EPC_DATE and the lodgement date(s).

        :param lodgement_date: a single date string, or a pandas Series of dates
        :return: an int for a string input, otherwise a Series of day counts
        """
        # NOTE(review): EARLIEST_EPC_DATE is not among this module's visible
        # imports - confirm it is brought into scope.
        elapsed = pd.to_datetime(lodgement_date) - pd.to_datetime(EARLIEST_EPC_DATE)

        if isinstance(lodgement_date, str):
            # A scalar difference is a Timedelta, which exposes ``.days`` directly.
            return elapsed.days

        # A Series difference needs the ``.dt`` accessor.
        return elapsed.dt.days

    def __add__(self, other) -> "TrainingDataset":
        """
        Combine the records of two TrainingDatasets into a new TrainingDataset.

        :raises TypeError: if ``other`` is not a TrainingDataset
        """
        if not isinstance(other, TrainingDataset):
            raise TypeError("Addition can only be performed with another instance of TrainingDataset")
        return TrainingDataset(self.datasets + other.datasets)

    def __radd__(self, other):
        """
        Required for sum() to work
        """
        # sum() starts from the integer 0, so an int on the left is ignored.
        if isinstance(other, int):
            return self
        return self.__add__(other)
|
||||
|
||||
class ScoringDataset:
    """
    Holds a list of EPCDifferenceRecords that are to be scored together.

    Instances can be concatenated with ``+`` and therefore with ``sum()``.
    """

    def __init__(self, datasets: List[EPCDifferenceRecord]) -> None:
        """
        :param datasets: the records making up this dataset
        """
        self.datasets = datasets

    def __add__(self, other) -> "ScoringDataset":
        """
        Return a new ScoringDataset holding this dataset's records followed by ``other``'s.

        :raises TypeError: if ``other`` is not a ScoringDataset
        """
        if isinstance(other, ScoringDataset):
            combined = self.datasets + other.datasets
            return ScoringDataset(combined)
        raise TypeError("Addition can only be performed with another instance of ScoringDataset")

    def __radd__(self, other):
        """
        Right-hand addition, needed so that sum() works
        """
        # sum() seeds its accumulator with an int, which is simply skipped.
        return self if isinstance(other, int) else self.__add__(other)
|
||||
290
etl/epc/EPCRecord.py
Normal file
290
etl/epc/EPCRecord.py
Normal file
|
|
@ -0,0 +1,290 @@
|
|||
|
||||
from dataclasses import dataclass
|
||||
from etl.epc.ValidationConfiguration import (
|
||||
EPCRecordValidationConfiguration,
|
||||
EPCDifferenceRecordValidationConfiguration,
|
||||
EPCDifferenceRecordFixedDataValidationConfiguration
|
||||
)
|
||||
from typing import Union, List
|
||||
from etl.epc.settings import (
|
||||
RDSAP_RESPONSE,
|
||||
HEAT_DEMAND_RESPONSE,
|
||||
CARBON_RESPONSE,
|
||||
COMPONENT_FEATURES,
|
||||
EFFICIENCY_FEATURES
|
||||
)
|
||||
|
||||
@dataclass
class EPCRecord:
    """
    Base class for an EPC record.

    Ordering comparisons (``<``, ``<=``, ``>``, ``>=``) look only at the field
    named by RDSAP_RESPONSE. Subtracting two records builds an
    EPCDifferenceRecord.
    """
    UPRN: str
    WALLS_DESCRIPTION: str
    FLOOR_DESCRIPTION: str
    LIGHTING_DESCRIPTION: str
    ROOF_DESCRIPTION: str
    MAINHEAT_DESCRIPTION: str
    HOTWATER_DESCRIPTION: str
    MAIN_FUEL: str
    MECHANICAL_VENTILATION: str
    SECONDHEAT_DESCRIPTION: str
    WINDOWS_DESCRIPTION: str
    GLAZED_TYPE: str
    MULTI_GLAZE_PROPORTION: float
    LOW_ENERGY_LIGHTING: float
    NUMBER_OPEN_FIREPLACES: float
    MAINHEATCONT_DESCRIPTION: str
    SOLAR_WATER_HEATING_FLAG: str
    PHOTO_SUPPLY: float
    TRANSACTION_TYPE: str
    ENERGY_TARIFF: str
    EXTENSION_COUNT: float
    TOTAL_FLOOR_AREA: float
    FLOOR_HEIGHT: float
    HOT_WATER_ENERGY_EFF: str
    FLOOR_ENERGY_EFF: str
    WINDOWS_ENERGY_EFF: str
    WALLS_ENERGY_EFF: str
    SHEATING_ENERGY_EFF: str
    ROOF_ENERGY_EFF: str
    MAINHEAT_ENERGY_EFF: str
    MAINHEATC_ENERGY_EFF: str
    LIGHTING_ENERGY_EFF: str
    POTENTIAL_ENERGY_EFFICIENCY: float
    ENVIRONMENT_IMPACT_POTENTIAL: float
    ENERGY_CONSUMPTION_POTENTIAL: float
    CO2_EMISSIONS_POTENTIAL: float
    LODGEMENT_DATE: str
    CURRENT_ENERGY_EFFICIENCY: int
    ENERGY_CONSUMPTION_CURRENT: int
    CO2_EMISSIONS_CURRENT: float

    def __post_init__(self):
        """
        Attach the validation configuration to the record.

        We can have validation and cleaning steps for each of the fields here.
        NOTE: field validation is not run on construction; call
        ``_field_validation()`` explicitly if it is needed.
        """
        self.validation_configuration = EPCRecordValidationConfiguration

    def _field_validation(self):
        """
        This method will validate each of the fields in the EPC record

        :raises ValueError: if a field fails its checks or its configured type is unsupported
        """
        for record_key, validation_config in self.validation_configuration.items():
            # Get the variable named record key from self
            field_value = self.__dict__[record_key]

            if validation_config['type'] == "string":
                self._validate_string(record_key, field_value, validation_config)
            elif validation_config['type'] == "float":
                self._validate_float(record_key, field_value, validation_config)
            else:
                raise ValueError(f"Validation type {validation_config['type']} not supported")

    def _run_validation_function(self, record_key: str, field_value: Union[str, float], validation_config: dict):
        """
        Run the optional custom 'function' validator of a field configuration.

        :raises ValueError: if the validator raises; the original error is chained as the cause
        """
        if 'function' not in validation_config:
            return

        try:
            validation_config['function'](field_value)
        except Exception as exc:
            raise ValueError(f"Field {record_key} has value {field_value} which does not pass the validation function {validation_config['function']}") from exc

    def _validate_string(self, record_key: str, field_value: Union[str, float], validation_config: dict):
        """
        Validate a string field

        :raises ValueError: if the value is not a str, fails the custom validator,
            or is not one of the configured acceptable values
        """
        if not isinstance(field_value, str):
            raise ValueError(f"Field {record_key} has value {field_value} which is not a string")

        self._run_validation_function(record_key, field_value, validation_config)

        # A missing 'acceptable_values' key is treated like None: no restriction.
        acceptable_values = validation_config.get('acceptable_values')
        if acceptable_values is not None:
            if field_value not in acceptable_values:
                raise ValueError(f"Field {record_key} has value {field_value} which is not in the acceptable values of {validation_config['acceptable_values']}")

    def _validate_float(self, record_key: str, field_value: Union[str, float], validation_config: dict):
        """
        Validate a float field

        :raises ValueError: if the value is not a float, fails the custom validator,
            or lies outside the configured inclusive range
        """
        if not isinstance(field_value, float):
            raise ValueError(f"Field {record_key} has value {field_value} which is not a float")

        self._run_validation_function(record_key, field_value, validation_config)

        # A missing 'range' key is treated like None: no restriction.
        value_range = validation_config.get('range')
        if value_range is not None:
            if field_value < value_range[0] or field_value > value_range[1]:
                raise ValueError(f"Field {record_key} has value {field_value} which is not in the acceptable range of {validation_config['range']}")

    def __sub__(self, other):
        """
        This method will return the difference between two EPC records

        The pair is passed with ``auto_sort=True``, so the resulting
        EPCDifferenceRecord decides which record is the starting one.

        :raises ValueError: if ``other`` is not an EPCRecord
        """
        if not isinstance(other, EPCRecord):
            raise ValueError("Can only subtract EPCRecord from EPCRecord")

        return EPCDifferenceRecord(record1=self, record2=other, auto_sort=True)

    def _comparison_scores(self, other):
        """
        Return the (own, other) values of the RDSAP_RESPONSE field.

        :raises ValueError: if ``other`` is not an EPCRecord
        """
        if not isinstance(other, EPCRecord):
            raise ValueError("Can only compare EPCRecord to EPCRecord")

        return self.__dict__[RDSAP_RESPONSE], other.__dict__[RDSAP_RESPONSE]

    def __gt__(self, other):
        """
        This method will return True if the EPC record is greater than the other
        """
        own_score, other_score = self._comparison_scores(other)
        return own_score > other_score

    def __ge__(self, other):
        """
        This method will return True if the EPC record is greater than or equal to the other
        """
        own_score, other_score = self._comparison_scores(other)
        return own_score >= other_score

    def __lt__(self, other):
        """
        This method will return True if the EPC record is less than the other
        """
        own_score, other_score = self._comparison_scores(other)
        return own_score < other_score

    def __le__(self, other):
        """
        This method will return True if the EPC record is less than or equal to the other
        """
        own_score, other_score = self._comparison_scores(other)
        return own_score <= other_score

    def get(self, key: Union[str, List[str]], return_asdict: bool = False, key_suffix: str = None):
        """
        This method will return the value of the key

        :param key: a single attribute name or a list of attribute names
        :param return_asdict: return a ``{name: value}`` dict instead of bare values
        :param key_suffix: appended to every name of the returned dict
            (only used when ``return_asdict`` is True)
        :return: the value, a list of values or a dict; unknown names map to None
        """
        values = self.__dict__

        if return_asdict:
            # A single name is wrapped so that it is not iterated character by character.
            keys = [key] if isinstance(key, str) else key
            output_dict = {x: values.get(x) for x in keys}
            if key_suffix is not None:
                output_dict = {f"{x}{key_suffix}": y for x, y in output_dict.items()}
            return output_dict

        if isinstance(key, list):
            return [values.get(x) for x in key]
        if isinstance(key, str):
            return values.get(key)
        return None
|
||||
|
||||
|
||||
class EPCDifferenceRecord:
    """
    Pairs two EPC records and exposes the change between them as a flat dict
    (``difference_record``).
    """

    def __init__(self, record1: EPCRecord, record2: EPCRecord, auto_sort: bool = False):
        """
        Build the difference record for a pair of EPC records.

        The expected usage is for record2 to carry the higher RDSAP score.

        :param record1: starting record
        :param record2: ending record
        :param auto_sort: when True, swap the two records if ``record2 <= record1``
        """
        self.record1 = record1
        self.record2 = record2
        self.difference_record = {}

        self.difference_validation_configuration = EPCDifferenceRecordValidationConfiguration
        self.fixed_data_validation_configuration = EPCDifferenceRecordFixedDataValidationConfiguration

        if auto_sort:
            if self.record2 <= self.record1:
                self.record1, self.record2 = self.record2, self.record1

        self._construct_difference_record()
        self._validate_difference_record()

    def _construct_difference_record(self):
        """
        Populate ``difference_record`` from the starting (record1) and ending (record2) records
        """
        start, end = self.record1, self.record2

        sap_end, sap_start = end.get(RDSAP_RESPONSE), start.get(RDSAP_RESPONSE)
        heat_end, heat_start = end.get(HEAT_DEMAND_RESPONSE), start.get(HEAT_DEMAND_RESPONSE)
        carbon_end, carbon_start = end.get(CARBON_RESPONSE), start.get(CARBON_RESPONSE)

        snapshot_fields = COMPONENT_FEATURES + EFFICIENCY_FEATURES + ["LODGEMENT_DATE"]

        record = {
            "UPRN": start.get("UPRN"),
            "RDSAP_CHANGE": sap_end - sap_start,
            "HEAT_DEMAND_CHANGE": heat_end - heat_start,
            "CARBON_CHANGE": carbon_end - carbon_start,
            "SAP_STARTING": sap_start,
            "SAP_ENDING": sap_end,
            "HEAT_DEMAND_STARTING": heat_start,
            "HEAT_DEMAND_ENDING": heat_end,
            "CARBON_STARTING": carbon_start,
            "CARBON_ENDING": carbon_end,
        }

        # TODO: DO we want to take the earliest potentials or max potentials?
        potential_fields = (
            "POTENTIAL_ENERGY_EFFICIENCY",
            "ENVIRONMENT_IMPACT_POTENTIAL",
            "ENERGY_CONSUMPTION_POTENTIAL",
            "CO2_EMISSIONS_POTENTIAL",
        )
        for potential in potential_fields:
            record[potential] = max(start.get(potential), end.get(potential))

        # Per-record snapshots: ending values first, then starting values.
        record.update(end.get(snapshot_fields, return_asdict=True, key_suffix="_ENDING"))
        record.update(start.get(snapshot_fields, return_asdict=True, key_suffix="_STARTING"))

        self.difference_record = record

    def _validate_difference_record(self):
        """
        Validate the difference record; currently performs no checks.
        """
        # A check rejecting negative numeric values was sketched here but is not enabled.
        return None

    def compare_fields_in_records(self, fields: List[str]):
        """
        Report whether the two records agree on every one of the given fields

        :param fields: names of the fields to compare
        :return: False as soon as one field differs, otherwise True
        """
        return not any(
            self.record1.get(field) != self.record2.get(field)
            for field in fields
        )

    def get(self, key: str):
        """
        Look up a key of the difference record; None when the key is absent
        """
        return self.difference_record.get(key)

    def append_fixed_data(self, fixed_data: dict):
        """
        Validate the fixed data and merge it into the difference record
        """
        self._validate_fixed_data(fixed_data)
        self.difference_record.update(fixed_data)

    def _validate_fixed_data(self, fixed_data: dict):
        """
        Validate the fixed data; currently performs no checks.
        """
        # More sophisticated checks can be added here, driven by
        # self.fixed_data_validation_configuration
        return None
|
||||
|
|
@ -21,3 +21,37 @@ EPCRecordValidationConfiguration = {
|
|||
"range": [0, 100]
|
||||
}
|
||||
}
|
||||
|
||||
# Validation rules for the fields of an EPCDifferenceRecord, keyed by field name.
# Currently empty: no rules are defined.
EPCDifferenceRecordValidationConfiguration = {
}

# Validation rules for the fixed property data appended to a difference record,
# keyed by field name. Each entry gives the expected "type" plus either the
# list of "acceptable_values" or the [min, max] "range".
EPCDifferenceRecordFixedDataValidationConfiguration = {
    "PROPERTY_TYPE": {
        "type": "string",
        "acceptable_values": ["House", "Flat", "Bungalow", "Maisonette", "Park home", "Other"]
    },
    "BUILT_FORM": {
        "type": "string",
        "acceptable_values": ["Detached", "Semi-Detached", "End-Terrace", "Mid-Terrace", "Enclosed Mid-Terrace", "Enclosed End-Terrace", "Enclosed Detached", "Not applicable"]
    },
    # NOTE(review): the key is spelled "CONSITUENCY" - confirm it matches the
    # name used in the fixed data before relying on this rule.
    "CONSITUENCY": {
        "type": "string",
        "acceptable_values": ["England", "Wales", "Scotland", "Northern Ireland"]
    },
    # NOTE(review): EPCRecord._field_validation only dispatches on "string" and
    # "float"; confirm that the fixed-data validator will handle "integer".
    "NUMBER_HABITABLE_ROOMS": {
        "type": "integer",
        "range": [0, 100]
    },
    "NUMBER_HEATED_ROOMS": {
        "type": "integer",
        "range": [0, 100]
    },
    "FIXED_LIGHTING_OUTLETS_COUNT": {
        "type": "integer",
        "range": [0, 100]
    },
    # NOTE(review): the list is empty, so a membership check against it would
    # reject every value - TODO populate it, or use None for "no restriction".
    "CONSTRUCTION_AGE_BAND": {
        "type": "string",
        "acceptable_values": []
    }
}
|
||||
|
|
@ -18,6 +18,7 @@ from etl.epc.settings import (
|
|||
MINIMUM_FLOOR_HEIGHT
|
||||
)
|
||||
from etl.epc.DataProcessor import DataProcessor
|
||||
from etl.epc.EPCRecord import EPCRecord, EPCDifferenceRecord
|
||||
from utils.s3 import save_dataframe_to_s3_parquet, read_from_s3
|
||||
from recommendations.rdsap_tables import england_wales_age_band_lookup
|
||||
from recommendations.recommendation_utils import (
|
||||
|
|
@ -223,6 +224,7 @@ def make_uvalues(df):
|
|||
df["row_index"] = df.index
|
||||
|
||||
uvalues = []
|
||||
# TODO: iterrows is the slowest way to do this, we should use a vectorised approach or itertuples
|
||||
for _, x in df.iterrows():
|
||||
|
||||
uprn = x["UPRN"]
|
||||
|
|
@ -379,285 +381,23 @@ def make_uvalues(df):
|
|||
return df
|
||||
|
||||
|
||||
def compare_records(earliest_record: pd.Series, latest_record: pd.Series, columns: list):
|
||||
"""
|
||||
For a list of columns, check if the earliest and latest record are the same
|
||||
If they are the same, we indicate this, because we have example of SAP scores changing
|
||||
without any feature changes
|
||||
:param earliest_record: pd.Series
|
||||
:param latest_record: pd.Series
|
||||
:param columns: list of columns to compare
|
||||
:return: boolean indicating whether or not all features are the same
|
||||
"""
|
||||
# def compare_records(earliest_record: pd.Series, latest_record: pd.Series, columns: list):
|
||||
# """
|
||||
# For a list of columns, check if the earliest and latest record are the same
|
||||
# If they are the same, we indicate this, because we have example of SAP scores changing
|
||||
# without any feature changes
|
||||
# :param earliest_record: pd.Series
|
||||
# :param latest_record: pd.Series
|
||||
# :param columns: list of columns to compare
|
||||
# :return: boolean indicating whether or not all features are the same
|
||||
# """
|
||||
|
||||
all_equal = True
|
||||
for col in columns:
|
||||
if earliest_record[col] != latest_record[col]:
|
||||
return False
|
||||
if all_equal:
|
||||
return True
|
||||
|
||||
from dataclasses import dataclass
|
||||
from etl.epc.ValidationConfiguration import EPCRecordValidationConfiguration
|
||||
from typing import Union, List
|
||||
|
||||
@dataclass
|
||||
class EPCRecord:
|
||||
"""
|
||||
Base class for a EPC record
|
||||
"""
|
||||
UPRN: str
|
||||
WALLS_DESCRIPTION: str
|
||||
FLOOR_DESCRIPTION: str
|
||||
LIGHTING_DESCRIPTION: str
|
||||
ROOF_DESCRIPTION: str
|
||||
MAINHEAT_DESCRIPTION: str
|
||||
HOTWATER_DESCRIPTION: str
|
||||
MAIN_FUEL: str
|
||||
MECHANICAL_VENTILATION: str
|
||||
SECONDHEAT_DESCRIPTION: str
|
||||
WINDOWS_DESCRIPTION: str
|
||||
GLAZED_TYPE: str
|
||||
MULTI_GLAZE_PROPORTION: float
|
||||
LOW_ENERGY_LIGHTING: float
|
||||
NUMBER_OPEN_FIREPLACES: float
|
||||
MAINHEATCONT_DESCRIPTION: str
|
||||
SOLAR_WATER_HEATING_FLAG: str
|
||||
PHOTO_SUPPLY: float
|
||||
TRANSACTION_TYPE: str
|
||||
ENERGY_TARIFF: str
|
||||
EXTENSION_COUNT: float
|
||||
TOTAL_FLOOR_AREA: float
|
||||
FLOOR_HEIGHT: float
|
||||
HOT_WATER_ENERGY_EFF: str
|
||||
FLOOR_ENERGY_EFF: str
|
||||
WINDOWS_ENERGY_EFF: str
|
||||
WALLS_ENERGY_EFF: str
|
||||
SHEATING_ENERGY_EFF: str
|
||||
ROOF_ENERGY_EFF: str
|
||||
MAINHEAT_ENERGY_EFF: str
|
||||
MAINHEATC_ENERGY_EFF: str
|
||||
LIGHTING_ENERGY_EFF: str
|
||||
POTENTIAL_ENERGY_EFFICIENCY: float
|
||||
ENVIRONMENT_IMPACT_POTENTIAL: float
|
||||
ENERGY_CONSUMPTION_POTENTIAL: float
|
||||
CO2_EMISSIONS_POTENTIAL: float
|
||||
LODGEMENT_DATE: str
|
||||
CURRENT_ENERGY_EFFICIENCY: int
|
||||
ENERGY_CONSUMPTION_CURRENT: int
|
||||
CO2_EMISSIONS_CURRENT: float
|
||||
|
||||
def __post_init__(self):
|
||||
# We can have validation and cleaning steps for each of the fields
|
||||
# self.WALLS_DESCRIPTION = 'check'
|
||||
# Could also have cleaning of records if needed
|
||||
# self._field_validation()
|
||||
pass
|
||||
|
||||
def _field_validation(self):
|
||||
"""
|
||||
This method will validate each of the fields in the EPC record
|
||||
"""
|
||||
self.validation_configuration = EPCRecordValidationConfiguration
|
||||
|
||||
for record_key, validation_config in self.validation_configuration.items():
|
||||
# Get the variable named record key from self
|
||||
field_value = self.__dict__[record_key]
|
||||
|
||||
if validation_config['type'] == "string":
|
||||
self._validate_string(record_key, field_value, validation_config)
|
||||
elif validation_config['type'] == "float":
|
||||
self._validate_float(field_value, validation_config)
|
||||
else:
|
||||
raise ValueError(f"Validation type {validation_config['type']} not supported")
|
||||
|
||||
def _validate_string(self, record_key: str, field_value: Union[str, float], validation_config: dict):
|
||||
"""
|
||||
Validate a string field
|
||||
"""
|
||||
if not isinstance(field_value, str):
|
||||
raise ValueError(f"Field {record_key} has value {field_value} which is not a string")
|
||||
|
||||
if 'function' in validation_config:
|
||||
try:
|
||||
validation_config['function'](field_value)
|
||||
except:
|
||||
raise ValueError(f"Field {record_key} has value {field_value} which does not pass the validation function {validation_config['function']}")
|
||||
|
||||
if validation_config['acceptable_values'] is not None:
|
||||
if field_value not in validation_config['acceptable_values']:
|
||||
raise ValueError(f"Field {record_key} has value {field_value} which is not in the acceptable values of {validation_config['acceptable_values']}")
|
||||
|
||||
def _validate_float(self, record_key: str, field_value: Union[str, float], validation_config: dict):
|
||||
"""
|
||||
Validate a float field
|
||||
"""
|
||||
if not isinstance(field_value, float):
|
||||
raise ValueError(f"Field {record_key} has value {field_value} which is not a float")
|
||||
|
||||
if 'function' in validation_config:
|
||||
try:
|
||||
validation_config['function'](field_value)
|
||||
except:
|
||||
raise ValueError(f"Field {record_key} has value {field_value} which does not pass the validation function {validation_config['function']}")
|
||||
|
||||
if validation_config['range'] is not None:
|
||||
if field_value < validation_config['range'][0] or field_value > validation_config['range'][1]:
|
||||
raise ValueError(f"Field {record_key} has value {field_value} which is not in the acceptable range of {validation_config['range']}")
|
||||
|
||||
def __sub__(self, other):
|
||||
"""
|
||||
This method will return the difference between two EPC records
|
||||
"""
|
||||
if not isinstance(other, EPCRecord):
|
||||
raise ValueError("Can only subtract EPCRecord from EPCRecord")
|
||||
|
||||
difference_record = EPCDifferenceRecord(record1=self, record2=other, auto_sort=True)
|
||||
|
||||
return difference_record
|
||||
|
||||
def __gt__(self, other):
|
||||
"""
|
||||
This method will return True if the EPC record is greater than or equal to the other
|
||||
"""
|
||||
if not isinstance(other, EPCRecord):
|
||||
raise ValueError("Can only compare EPCRecord to EPCRecord")
|
||||
|
||||
return self.__dict__[RDSAP_RESPONSE] > other.__dict__[RDSAP_RESPONSE]
|
||||
|
||||
def __ge__(self, other):
|
||||
"""
|
||||
This method will return True if the EPC record is greater than or equal to the other
|
||||
"""
|
||||
if not isinstance(other, EPCRecord):
|
||||
raise ValueError("Can only compare EPCRecord to EPCRecord")
|
||||
|
||||
return self.__dict__[RDSAP_RESPONSE] >= other.__dict__[RDSAP_RESPONSE]
|
||||
|
||||
def __lt__(self, other):
|
||||
"""
|
||||
This method will return True if the EPC record is greater than or equal to the other
|
||||
"""
|
||||
if not isinstance(other, EPCRecord):
|
||||
raise ValueError("Can only compare EPCRecord to EPCRecord")
|
||||
|
||||
return self.__dict__[RDSAP_RESPONSE] < other.__dict__[RDSAP_RESPONSE]
|
||||
|
||||
def __le__(self, other):
|
||||
"""
|
||||
This method will return True if the EPC record is greater than or equal to the other
|
||||
"""
|
||||
if not isinstance(other, EPCRecord):
|
||||
raise ValueError("Can only compare EPCRecord to EPCRecord")
|
||||
|
||||
return self.__dict__[RDSAP_RESPONSE] <= other.__dict__[RDSAP_RESPONSE]
|
||||
|
||||
def get(self, key: Union[str, List[str]], return_asdict: bool = False, key_suffix: str = None):
|
||||
"""
|
||||
This method will return the value of the key
|
||||
"""
|
||||
if return_asdict:
|
||||
output_dict = {x: self.__dict__[x] if x in self.__dict__.keys() else None for x in key}
|
||||
if key_suffix is not None:
|
||||
output_dict = {f"{x}_{key_suffix}": y for x, y in output_dict.items()}
|
||||
return output_dict
|
||||
|
||||
if isinstance(key, list):
|
||||
return [self.__dict__[x] if x in self.__dict__.keys() else None for x in key]
|
||||
elif isinstance(key, str):
|
||||
return self.__dict__[key] if key in self.__dict__.keys() else None
|
||||
|
||||
|
||||
class EPCDifferenceRecord:
|
||||
"""
|
||||
Base class for the difference between two EPC records
|
||||
"""
|
||||
|
||||
def __init__(self, record1: EPCRecord, record2: EPCRecord, auto_sort: bool = False):
|
||||
"""
|
||||
This method will initialise the EPCDifferenceRecord
|
||||
Defaults usage is with record2 to have the higher RDSAP score
|
||||
"""
|
||||
self.record1 = record1
|
||||
self.record2 = record2
|
||||
self.difference_record = {}
|
||||
|
||||
if auto_sort and (self.record2 <= self.record1):
|
||||
self.record1, self.record2 = self.record2, self.record1
|
||||
|
||||
self._construct_difference_record()
|
||||
self._validate_difference_record()
|
||||
|
||||
def _construct_difference_record(self):
|
||||
"""
|
||||
This method will construct the difference record between the two records
|
||||
"""
|
||||
|
||||
rdsap_change = self.record2.get(RDSAP_RESPONSE) - self.record1.get(RDSAP_RESPONSE)
|
||||
heat_demand_change = self.record2.get(HEAT_DEMAND_RESPONSE) - self.record1.get(HEAT_DEMAND_RESPONSE)
|
||||
carbon_change = self.record2.get(CARBON_RESPONSE) - self.record1.get(CARBON_RESPONSE)
|
||||
|
||||
component_variables = COMPONENT_FEATURES + EFFICIENCY_FEATURES
|
||||
ending_record = self.record2.get(component_variables + ["LODGEMENT_DATE"], return_asdict=True, key_suffix="_ENDING")
|
||||
starting_record = self.record1.get(component_variables + ["LODGEMENT_DATE"], return_asdict=True, key_suffix="_STARTING")
|
||||
|
||||
# TODO: DO we want to take the earliest potentials or max potentials?
|
||||
self.difference_record = {
|
||||
"UPRN": self.record1.get("UPRN"),
|
||||
"RDSAP_CHANGE": rdsap_change,
|
||||
"HEAT_DEMAND_CHANGE": heat_demand_change,
|
||||
"CARBON_CHANGE": carbon_change,
|
||||
"SAP_STARTING": self.record1.get(RDSAP_RESPONSE),
|
||||
"SAP_ENDING": self.record2.get(RDSAP_RESPONSE),
|
||||
"HEAT_DEMAND_STARTING": self.record1.get(HEAT_DEMAND_RESPONSE),
|
||||
"HEAT_DEMAND_ENDING": self.record2.get(HEAT_DEMAND_RESPONSE),
|
||||
"CARBON_STARTING": self.record1.get(CARBON_RESPONSE),
|
||||
"CARBON_ENDING": self.record2.get(CARBON_RESPONSE),
|
||||
"POTENTIAL_ENERGY_EFFICIENCY": max(self.record1.get("POTENTIAL_ENERGY_EFFICIENCY"), self.record2.get("POTENTIAL_ENERGY_EFFICIENCY")),
|
||||
"ENVIRONMENT_IMPACT_POTENTIAL": max(self.record1.get("ENVIRONMENT_IMPACT_POTENTIAL"), self.record2.get("ENVIRONMENT_IMPACT_POTENTIAL")),
|
||||
"ENERGY_CONSUMPTION_POTENTIAL": max(self.record1.get("ENERGY_CONSUMPTION_POTENTIAL"), self.record2.get("ENERGY_CONSUMPTION_POTENTIAL")),
|
||||
"CO2_EMISSIONS_POTENTIAL": max(self.record1.get("CO2_EMISSIONS_POTENTIAL"), self.record2.get("CO2_EMISSIONS_POTENTIAL")),
|
||||
**ending_record,
|
||||
**starting_record
|
||||
}
|
||||
|
||||
def _validate_difference_record(self):
|
||||
"""
|
||||
This method will validate the difference record
|
||||
"""
|
||||
# for key, value in self.difference_record.items():
|
||||
# if key == "LODGEMENT_DATE":
|
||||
# continue
|
||||
# if isinstance(value, str):
|
||||
# continue
|
||||
# if value < 0:
|
||||
# raise ValueError(f"Difference record has negative value for {key}")
|
||||
pass
|
||||
|
||||
def compare_fields_in_records(self, fields: List[str]):
|
||||
"""
|
||||
This method will compare the records, for specific fields
|
||||
"""
|
||||
|
||||
all_equal = True
|
||||
for field in fields:
|
||||
if self.record1.get(field) != self.record2.get(field):
|
||||
return False
|
||||
|
||||
if all_equal:
|
||||
return True
|
||||
|
||||
def get(self, key: str):
|
||||
"""
|
||||
This method will return the value of the key
|
||||
"""
|
||||
return self.difference_record[key] if key in self.difference_record.keys() else None
|
||||
|
||||
def append_fixed_data(self, fixed_data: dict):
|
||||
"""
|
||||
This method will append fixed data to the difference record
|
||||
"""
|
||||
self.difference_record.update(fixed_data)
|
||||
# all_equal = True
|
||||
# for col in columns:
|
||||
# if earliest_record[col] != latest_record[col]:
|
||||
# return False
|
||||
# if all_equal:
|
||||
# return True
|
||||
|
||||
def app():
|
||||
# Get all the files in the directory
|
||||
|
|
@ -686,9 +426,9 @@ def app():
|
|||
|
||||
cleaning_dataset.append(data_processor.cleaning_averages)
|
||||
|
||||
data_by_urpn = []
|
||||
data_by_uprn = []
|
||||
for uprn, property_data in df.groupby("UPRN", observed=True):
|
||||
asdasd
|
||||
|
||||
# Fixed features - these are property attributes that shouldn't change over time
|
||||
fixed_data = {}
|
||||
|
||||
|
|
@ -748,102 +488,109 @@ def app():
|
|||
|
||||
difference_record.append_fixed_data(fixed_data)
|
||||
|
||||
property_model_data.append(difference_record.difference_record)
|
||||
property_model_data.append(difference_record)
|
||||
|
||||
for idx in range(0, property_data.shape[0] - 1):
|
||||
# property_model_data.append(difference_record.difference_record)
|
||||
|
||||
# for idx in range(0, property_data.shape[0] - 1):
|
||||
|
||||
if idx >= property_data.shape[0] - 1:
|
||||
break
|
||||
# if idx >= property_data.shape[0] - 1:
|
||||
# break
|
||||
|
||||
earliest_record = variable_data.iloc[idx]
|
||||
latest_record = variable_data.iloc[idx + 1]
|
||||
# earliest_record = variable_data.iloc[idx]
|
||||
# latest_record = variable_data.iloc[idx + 1]
|
||||
|
||||
# Check if the sap gets better or worse
|
||||
gets_better = earliest_record[RDSAP_RESPONSE] <= latest_record[RDSAP_RESPONSE]
|
||||
# # Check if the sap gets better or worse
|
||||
# gets_better = earliest_record[RDSAP_RESPONSE] <= latest_record[RDSAP_RESPONSE]
|
||||
|
||||
component_variables = COMPONENT_FEATURES + EFFICIENCY_FEATURES
|
||||
# component_variables = COMPONENT_FEATURES + EFFICIENCY_FEATURES
|
||||
|
||||
if gets_better:
|
||||
starting_sap = earliest_record[RDSAP_RESPONSE]
|
||||
starting_heat_demand = earliest_record[HEAT_DEMAND_RESPONSE]
|
||||
starting_carbon = earliest_record[CARBON_RESPONSE]
|
||||
# if gets_better:
|
||||
# starting_sap = earliest_record[RDSAP_RESPONSE]
|
||||
# starting_heat_demand = earliest_record[HEAT_DEMAND_RESPONSE]
|
||||
# starting_carbon = earliest_record[CARBON_RESPONSE]
|
||||
|
||||
ending_sap = latest_record[RDSAP_RESPONSE]
|
||||
ending_heat_demand = latest_record[HEAT_DEMAND_RESPONSE]
|
||||
ending_carbon = latest_record[CARBON_RESPONSE]
|
||||
# ending_sap = latest_record[RDSAP_RESPONSE]
|
||||
# ending_heat_demand = latest_record[HEAT_DEMAND_RESPONSE]
|
||||
# ending_carbon = latest_record[CARBON_RESPONSE]
|
||||
|
||||
rdsap_change = latest_record[RDSAP_RESPONSE] - starting_sap
|
||||
heat_demand_change = latest_record[HEAT_DEMAND_RESPONSE] - starting_heat_demand
|
||||
carbon_change = latest_record[CARBON_RESPONSE] - starting_carbon
|
||||
# rdsap_change = latest_record[RDSAP_RESPONSE] - starting_sap
|
||||
# heat_demand_change = latest_record[HEAT_DEMAND_RESPONSE] - starting_heat_demand
|
||||
# carbon_change = latest_record[CARBON_RESPONSE] - starting_carbon
|
||||
|
||||
starting_record = earliest_record[component_variables + ["LODGEMENT_DATE"]].add_suffix("_STARTING")
|
||||
ending_record = latest_record[component_variables + ["LODGEMENT_DATE"]].add_suffix("_ENDING")
|
||||
else:
|
||||
starting_sap = latest_record[RDSAP_RESPONSE]
|
||||
starting_heat_demand = latest_record[HEAT_DEMAND_RESPONSE]
|
||||
starting_carbon = latest_record[CARBON_RESPONSE]
|
||||
# starting_record = earliest_record[component_variables + ["LODGEMENT_DATE"]].add_suffix("_STARTING")
|
||||
# ending_record = latest_record[component_variables + ["LODGEMENT_DATE"]].add_suffix("_ENDING")
|
||||
# else:
|
||||
# starting_sap = latest_record[RDSAP_RESPONSE]
|
||||
# starting_heat_demand = latest_record[HEAT_DEMAND_RESPONSE]
|
||||
# starting_carbon = latest_record[CARBON_RESPONSE]
|
||||
|
||||
ending_sap = earliest_record[RDSAP_RESPONSE]
|
||||
ending_heat_demand = earliest_record[HEAT_DEMAND_RESPONSE]
|
||||
ending_carbon = earliest_record[CARBON_RESPONSE]
|
||||
# ending_sap = earliest_record[RDSAP_RESPONSE]
|
||||
# ending_heat_demand = earliest_record[HEAT_DEMAND_RESPONSE]
|
||||
# ending_carbon = earliest_record[CARBON_RESPONSE]
|
||||
|
||||
rdsap_change = earliest_record[RDSAP_RESPONSE] - starting_sap
|
||||
heat_demand_change = earliest_record[HEAT_DEMAND_RESPONSE] - starting_heat_demand
|
||||
carbon_change = earliest_record[CARBON_RESPONSE] - starting_carbon
|
||||
# rdsap_change = earliest_record[RDSAP_RESPONSE] - starting_sap
|
||||
# heat_demand_change = earliest_record[HEAT_DEMAND_RESPONSE] - starting_heat_demand
|
||||
# carbon_change = earliest_record[CARBON_RESPONSE] - starting_carbon
|
||||
|
||||
starting_record = latest_record[component_variables + ["LODGEMENT_DATE"]].add_suffix("_STARTING")
|
||||
ending_record = earliest_record[component_variables + ["LODGEMENT_DATE"]].add_suffix("_ENDING")
|
||||
# starting_record = latest_record[component_variables + ["LODGEMENT_DATE"]].add_suffix("_STARTING")
|
||||
# ending_record = earliest_record[component_variables + ["LODGEMENT_DATE"]].add_suffix("_ENDING")
|
||||
|
||||
if rdsap_change == 0:
|
||||
continue
|
||||
# if rdsap_change == 0:
|
||||
# continue
|
||||
|
||||
all_equal = compare_records(
|
||||
earliest_record=earliest_record,
|
||||
latest_record=latest_record,
|
||||
columns=CORE_COMPONENT_FEATURES
|
||||
)
|
||||
# all_equal = compare_records(
|
||||
# earliest_record=earliest_record,
|
||||
# latest_record=latest_record,
|
||||
# columns=CORE_COMPONENT_FEATURES
|
||||
# )
|
||||
|
||||
if all_equal:
|
||||
# Keep track of this for the moment so we can analyse
|
||||
all_equal_rows.append({"uprn": uprn, "directory_name": directory.name})
|
||||
continue
|
||||
asdasd
|
||||
features = pd.concat([starting_record, ending_record])
|
||||
# if all_equal:
|
||||
# # Keep track of this for the moment so we can analyse
|
||||
# all_equal_rows.append({"uprn": uprn, "directory_name": directory.name})
|
||||
# continue
|
||||
# asdasd
|
||||
# features = pd.concat([starting_record, ending_record])
|
||||
|
||||
property_model_data.append(
|
||||
{
|
||||
"UPRN": uprn,
|
||||
"RDSAP_CHANGE": rdsap_change,
|
||||
"HEAT_DEMAND_CHANGE": heat_demand_change,
|
||||
"CARBON_CHANGE": carbon_change,
|
||||
"SAP_STARTING": starting_sap,
|
||||
"SAP_ENDING": ending_sap,
|
||||
"HEAT_DEMAND_STARTING": starting_heat_demand,
|
||||
"HEAT_DEMAND_ENDING": ending_heat_demand,
|
||||
"CARBON_STARTING": starting_carbon,
|
||||
"CARBON_ENDING": ending_carbon,
|
||||
"POTENTIAL_ENERGY_EFFICIENCY": earliest_record["POTENTIAL_ENERGY_EFFICIENCY"],
|
||||
"ENVIRONMENT_IMPACT_POTENTIAL": earliest_record["ENVIRONMENT_IMPACT_POTENTIAL"],
|
||||
"ENERGY_CONSUMPTION_POTENTIAL": earliest_record["ENERGY_CONSUMPTION_POTENTIAL"],
|
||||
"CO2_EMISSIONS_POTENTIAL": earliest_record["CO2_EMISSIONS_POTENTIAL"],
|
||||
**fixed_data,
|
||||
**features.to_dict(),
|
||||
}
|
||||
)
|
||||
# property_model_data.append(
|
||||
# {
|
||||
# "UPRN": uprn,
|
||||
# "RDSAP_CHANGE": rdsap_change,
|
||||
# "HEAT_DEMAND_CHANGE": heat_demand_change,
|
||||
# "CARBON_CHANGE": carbon_change,
|
||||
# "SAP_STARTING": starting_sap,
|
||||
# "SAP_ENDING": ending_sap,
|
||||
# "HEAT_DEMAND_STARTING": starting_heat_demand,
|
||||
# "HEAT_DEMAND_ENDING": ending_heat_demand,
|
||||
# "CARBON_STARTING": starting_carbon,
|
||||
# "CARBON_ENDING": ending_carbon,
|
||||
# "POTENTIAL_ENERGY_EFFICIENCY": earliest_record["POTENTIAL_ENERGY_EFFICIENCY"],
|
||||
# "ENVIRONMENT_IMPACT_POTENTIAL": earliest_record["ENVIRONMENT_IMPACT_POTENTIAL"],
|
||||
# "ENERGY_CONSUMPTION_POTENTIAL": earliest_record["ENERGY_CONSUMPTION_POTENTIAL"],
|
||||
# "CO2_EMISSIONS_POTENTIAL": earliest_record["CO2_EMISSIONS_POTENTIAL"],
|
||||
# **fixed_data,
|
||||
# **features.to_dict(),
|
||||
# }
|
||||
# )
|
||||
|
||||
data_by_urpn.extend(property_model_data)
|
||||
# data_by_urpn.extend(property_model_data)
|
||||
data_by_uprn.extend(property_model_data)
|
||||
|
||||
from etl.epc.Dataset import TrainingDataset
|
||||
constituency_data = TrainingDataset(datasets=data_by_uprn)
|
||||
|
||||
data_by_urpn_df = pd.DataFrame(data_by_urpn)
|
||||
|
||||
data_by_urpn_df["DAYS_TO_STARTING"] = DataProcessor.calculate_days_to(
|
||||
data_by_urpn_df["LODGEMENT_DATE_STARTING"]
|
||||
)
|
||||
# # TODO: can we move this into the epc record?
|
||||
# data_by_urpn_df["DAYS_TO_STARTING"] = DataProcessor.calculate_days_to(
|
||||
# data_by_urpn_df["LODGEMENT_DATE_STARTING"]
|
||||
# )
|
||||
|
||||
data_by_urpn_df["DAYS_TO_ENDING"] = DataProcessor.calculate_days_to(
|
||||
data_by_urpn_df["LODGEMENT_DATE_ENDING"]
|
||||
)
|
||||
# data_by_urpn_df["DAYS_TO_ENDING"] = DataProcessor.calculate_days_to(
|
||||
# data_by_urpn_df["LODGEMENT_DATE_ENDING"]
|
||||
# )
|
||||
|
||||
data_by_urpn_df = data_by_urpn_df.drop(columns=["LODGEMENT_DATE_STARTING", "LODGEMENT_DATE_ENDING"])
|
||||
# data_by_urpn_df = data_by_urpn_df.drop(columns=["LODGEMENT_DATE_STARTING", "LODGEMENT_DATE_ENDING"])
|
||||
|
||||
data_by_urpn_df = DataProcessor.clean_efficiency_variables(data_by_urpn_df)
|
||||
|
||||
|
|
@ -889,6 +636,7 @@ def app():
|
|||
|
||||
output = pd.concat(dataset)
|
||||
|
||||
# TODO: move into difference record
|
||||
# Remove any records that have huge swings in their floor area
|
||||
output["tfa_diff_abs"] = abs(output["TOTAL_FLOOR_AREA_ENDING"] - output["TOTAL_FLOOR_AREA_STARTING"])
|
||||
output["tfa_diff_prop"] = output["tfa_diff_abs"] / output["TOTAL_FLOOR_AREA_STARTING"]
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue