add validation layer

This commit is contained in:
Michael Duong 2023-12-05 12:32:48 +00:00
parent 2845badbc0
commit d76dc3fc56
2 changed files with 84 additions and 6 deletions

View file

@ -0,0 +1,23 @@
"""
Specify the validation rules for each field in the different records.
"""
def validate_walls_description(value):
    """
    Check that a walls description is one of the recognised categories.

    Returns None when the value is recognised and raises ValueError otherwise.
    """
    recognised = ("Cavity", "Solid", "System built", "Timber frame", "Suspended timber", "Other")
    if value in recognised:
        return
    raise ValueError("Walls description is not valid")
# Validation rules keyed by EPC record field name. Every rule carries a "type"
# ("string" or "float"); the remaining keys are constraints:
#   "acceptable_values": list of values the field may take
#   "range":             [lower, upper] bounds for a numeric field
#   "function":          callable that raises when the value is invalid
EPCRecordValidationConfiguration = {
    "WALLS_DESCRIPTION": {
        "type": "string",
        "acceptable_values": ["Cavity", "Solid", "System built", "Timber frame", "Suspended timber", "Other"],
        "function": validate_walls_description,
    },
    "FLOOR_DESCRIPTION": {
        "type": "string",
        "acceptable_values": ["Solid", "Suspended", "Other"],
    },
    "ENERGY_CONSUMPTION_CURRENT": {
        "type": "float",
        "range": [0, 100],
    },
}

View file

@ -398,6 +398,8 @@ def compare_records(earliest_record: pd.Series, latest_record: pd.Series, column
return True
from dataclasses import dataclass
from etl.epc.ValidationConfiguration import EPCRecordValidationConfiguration
from typing import Union
@dataclass
class EPCRecord:
"""
@ -443,6 +445,62 @@ class EPCRecord:
ENERGY_CONSUMPTION_CURRENT: int
CO2_EMISSIONS_CURRENT: float
def __post_init__(self):
# We can have validation and cleaning steps for each of the fields
self.WALLS_DESCRIPTION = 'check'
self._field_validation()
def _field_validation(self):
    """
    Validate each field of the EPC record that has a rule in
    EPCRecordValidationConfiguration.

    The configuration is also stored on the instance as
    `validation_configuration`.

    Raises:
        ValueError: if a field fails its rule, or a rule declares a "type"
            other than "string" or "float".
    """
    self.validation_configuration = EPCRecordValidationConfiguration
    for record_key, validation_config in self.validation_configuration.items():
        # Look the field up by name on the instance
        field_value = getattr(self, record_key)
        field_type = validation_config['type']
        if field_type == "string":
            self._validate_string(record_key, field_value, validation_config)
        elif field_type == "float":
            # record_key must be passed: _validate_float takes
            # (record_key, field_value, validation_config)
            self._validate_float(record_key, field_value, validation_config)
        else:
            raise ValueError(f"Validation type {validation_config['type']} not supported")
def _validate_string(self, record_key: str, field_value: Union[str, float], validation_config: dict):
"""
Validate a string field
"""
if 'function' in validation_config:
try:
validation_config['function'](field_value)
except:
raise ValueError(f"Field {record_key} has value {field_value} which does not pass the validation function {validation_config['function']}")
if validation_config['acceptable_values'] is not None:
if field_value not in validation_config['acceptable_values']:
raise ValueError(f"Field {record_key} has value {field_value} which is not in the acceptable values of {validation_config['acceptable_values']}")
def _validate_float(self, record_key: str, field_value: Union[str, float], validation_config: dict):
"""
Validate a float field
"""
if 'function' in validation_config:
try:
validation_config['function'](field_value)
except:
raise ValueError(f"Field {record_key} has value {field_value} which does not pass the validation function {validation_config['function']}")
if validation_config['range'] is not None:
if field_value < validation_config['range'][0] or field_value > validation_config['range'][1]:
raise ValueError(f"Field {record_key} has value {field_value} which is not in the acceptable range of {validation_config['range']}")
# def __init__(self, num) -> None:
# self.num = num
@ -450,11 +508,6 @@ class EPCRecord:
# return self.num - other.num
test = EPCRecord(10)
test2 = EPCRecord(20)
test - test2
def app():
# Get all the files in the directory
@ -484,7 +537,7 @@ def app():
data_by_urpn = []
for uprn, property_data in df.groupby("UPRN", observed=True):
# Fixed features - these are property attributes that shouldn't change over time
fixed_data = {}
@ -515,6 +568,8 @@ def app():
# Note: we look at changes between subsequent EPCS, however we could look at other permutations
# e.g. first vs second, second vs third and also first vs third
property_model_data = []
temp = [EPCRecord(**x) for x in variable_data.to_dict(orient='records')]
for idx in range(0, property_data.shape[0] - 1):
if idx >= property_data.shape[0] - 1: