diff --git a/etl/epc/ValidationConfiguration.py b/etl/epc/ValidationConfiguration.py
new file mode 100644
index 00000000..c2487706
--- /dev/null
+++ b/etl/epc/ValidationConfiguration.py
@@ -0,0 +1,24 @@
+"""
+Specify the validation rules for each field in the different records.
+"""
+
+def validate_walls_description(value):
+    """Raise ValueError if value is not an accepted walls description."""
+    if value not in ["Cavity", "Solid", "System built", "Timber frame", "Suspended timber", "Other"]:
+        raise ValueError("Walls description is not valid")
+
+EPCRecordValidationConfiguration = {
+    "WALLS_DESCRIPTION": {
+        "type": "string",
+        "acceptable_values": ["Cavity", "Solid", "System built", "Timber frame", "Suspended timber", "Other"],
+        "function": validate_walls_description
+    },
+    "FLOOR_DESCRIPTION": {
+        "type": "string",
+        "acceptable_values": ["Solid", "Suspended", "Other"]
+    },
+    "ENERGY_CONSUMPTION_CURRENT": {
+        "type": "float",
+        "range": [0, 100]
+    }
+}
diff --git a/etl/epc/property_change_app.py b/etl/epc/property_change_app.py
index 471fe7b0..a792ae4a 100644
--- a/etl/epc/property_change_app.py
+++ b/etl/epc/property_change_app.py
@@ -398,6 +398,8 @@ def compare_records(earliest_record: pd.Series, latest_record: pd.Series, column
     return True
 
 from dataclasses import dataclass
+from etl.epc.ValidationConfiguration import EPCRecordValidationConfiguration
+from typing import Union
 @dataclass
 class EPCRecord:
     """
@@ -443,6 +445,62 @@ class EPCRecord:
     ENERGY_CONSUMPTION_CURRENT: int
     CO2_EMISSIONS_CURRENT: float
 
+    def __post_init__(self):
+        # Cleaning steps for individual fields can be added here, ahead of validation
+        self._field_validation()
+
+    def _field_validation(self):
+        """
+        Validate every field listed in EPCRecordValidationConfiguration.
+
+        Raises ValueError when a field fails its rules or when a configured
+        validation type is not supported.
+        """
+        self.validation_configuration = EPCRecordValidationConfiguration
+
+        for record_key, validation_config in self.validation_configuration.items():
+            # Look the field up on the instance by its configured name
+            field_value = getattr(self, record_key)
+
+            if validation_config['type'] == "string":
+                self._validate_string(record_key, field_value, validation_config)
+            elif validation_config['type'] == "float":
+                self._validate_float(record_key, field_value, validation_config)
+            else:
+                raise ValueError(f"Validation type {validation_config['type']} not supported")
+
+    def _validate_string(self, record_key: str, field_value: Union[str, float], validation_config: dict):
+        """
+        Validate a string field against its optional 'function' and
+        'acceptable_values' rules, raising ValueError on failure.
+        """
+
+        if 'function' in validation_config:
+            try:
+                validation_config['function'](field_value)
+            except Exception as err:
+                raise ValueError(f"Field {record_key} has value {field_value} which does not pass the validation function {validation_config['function']}") from err
+
+        acceptable_values = validation_config.get('acceptable_values')
+        if acceptable_values is not None and field_value not in acceptable_values:
+            raise ValueError(f"Field {record_key} has value {field_value} which is not in the acceptable values of {acceptable_values}")
+
+    def _validate_float(self, record_key: str, field_value: Union[str, float], validation_config: dict):
+        """
+        Validate a float field against its optional 'function' and 'range'
+        rules (both range bounds are accepted), raising ValueError on failure.
+        """
+
+        if 'function' in validation_config:
+            try:
+                validation_config['function'](field_value)
+            except Exception as err:
+                raise ValueError(f"Field {record_key} has value {field_value} which does not pass the validation function {validation_config['function']}") from err
+
+        value_range = validation_config.get('range')
+        if value_range is not None and (field_value < value_range[0] or field_value > value_range[1]):
+            raise ValueError(f"Field {record_key} has value {field_value} which is not in the acceptable range of {value_range}")
+
 
     # def __init__(self, num) -> None:
     #     self.num = num
@@ -450,11 +508,6 @@ class EPCRecord:
     #     return self.num - other.num
 
 
-test = EPCRecord(10)
-test2 = EPCRecord(20)
-test - test2
-
-
 def app():
 
     # Get all the files in the directory
@@ -484,7 +537,7 @@ def app():
     data_by_urpn = []
 
     for uprn, property_data in df.groupby("UPRN", observed=True):
-        
+
         # Fixed features - these are property attributes that shouldn't change over time
         fixed_data = {}
 
@@ -515,6 +568,8 @@ def app():
         # Note: we look at changes between subsequent EPCS, however we could look at other permutations
         # e.g. first vs second, second vs third and also first vs third
         property_model_data = []
+
+        temp = [EPCRecord(**x) for x in variable_data.to_dict(orient='records')]
         for idx in range(0, property_data.shape[0] - 1):
 
             if idx >= property_data.shape[0] - 1: