From c6fe7ca5f388152775e8135e54c4f9e6d3db08e3 Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Tue, 5 Dec 2023 16:07:47 +0000 Subject: [PATCH] adding more to the difference record --- etl/epc/property_change_app.py | 164 +++++++++++++++++++++++++++++++-- 1 file changed, 156 insertions(+), 8 deletions(-) diff --git a/etl/epc/property_change_app.py b/etl/epc/property_change_app.py index 36eb31b6..da425b3a 100644 --- a/etl/epc/property_change_app.py +++ b/etl/epc/property_change_app.py @@ -399,7 +399,75 @@ def compare_records(earliest_record: pd.Series, latest_record: pd.Series, column from dataclasses import dataclass from etl.epc.ValidationConfiguration import EPCRecordValidationConfiguration -from typing import Union +from typing import Union, List + + +class EPCDifferenceRecord: + """ + Base class for the difference between two EPC records + """ + + def __init__(self, record1: EPCRecord, record2: EPCRecord, auto_sort: bool = False): + """ + This method will initialise the EPCDifferenceRecord + Defaults usage is with record1 to have the higher RDSAP score + """ + self.record1 = record1 + self.record2 = record2 + self.difference_record = {} + + if auto_sort and (self.record1 <= self.record2): + self.record1, self.record2 = self.record2, self.record1 + + self._calculate_difference_record() + self._validate_difference_record() + + def _calculate_difference_record(self): + """ + This method will calculate the difference between the two records + """ + self.difference_record = {} + for key in self.record1.__dict__.keys(): + if key == "LODGEMENT_DATE": + continue + self.difference_record[key] = self.record1.get(key) + # - self.record2.get(key) + + def _validate_difference_record(self): + """ + This method will validate the difference record + """ + # for key, value in self.difference_record.items(): + # if key == "LODGEMENT_DATE": + # continue + # if isinstance(value, str): + # continue + # if value < 0: + # raise ValueError(f"Difference record has negative value for {key}") + pass + + def compare_fields_in_records(self, fields: List[str]): + """ + This method will compare the records, for specific fields + """ + + all_equal = True + for field in fields: + if self.record1.get(field) != self.record2.get(field): + return False + + if all_equal: + return True + + def get(self, key: str): + """ + This method will return the value of the key + """ + return self.difference_record[key] + + + + @dataclass class EPCRecord: """ @@ -447,9 +515,10 @@ class EPCRecord: def __post_init__(self): # We can have validation and cleaning steps for each of the fields - self.WALLS_DESCRIPTION = 'check' - - self._field_validation() + # self.WALLS_DESCRIPTION = 'check' + # Could also have cleaning of records if needed + # self._field_validation() + pass def _field_validation(self): """ @@ -500,9 +569,61 @@ class EPCRecord: if validation_config['range'] is not None: if field_value < validation_config['range'][0] or field_value > validation_config['range'][1]: - raise ValueError(f"Field {record_key} has value {field_value} which is not in the acceptable range of {validation_config['range']}") - + raise ValueError(f"Field {record_key} has value {field_value} which is not in the acceptable range of {validation_config['range']}") + def __sub__(self, other): + """ + This method will return the difference between two EPC records + """ + if not isinstance(other, EPCRecord): + raise ValueError("Can only subtract EPCRecord from EPCRecord") + + difference_record = EPCDifferenceRecord(record1=self, record2=other, auto_sort=True) + + return difference_record + + def __gt__(self, other): + """ + This method will return True if the EPC record is greater than or equal to the other + """ + if not isinstance(other, EPCRecord): + raise ValueError("Can only compare EPCRecord to EPCRecord") + + return self.__dict__[RDSAP_RESPONSE] > other.__dict__[RDSAP_RESPONSE] + + def __ge__(self, other): + """ + This method will return True if the EPC record is greater than or equal to the other + """ + if not isinstance(other, EPCRecord): + raise ValueError("Can only compare EPCRecord to EPCRecord") + + return self.__dict__[RDSAP_RESPONSE] >= other.__dict__[RDSAP_RESPONSE] + + def __lt__(self, other): + """ + This method will return True if the EPC record is greater than or equal to the other + """ + if not isinstance(other, EPCRecord): + raise ValueError("Can only compare EPCRecord to EPCRecord") + + return self.__dict__[RDSAP_RESPONSE] < other.__dict__[RDSAP_RESPONSE] + + def __le__(self, other): + """ + This method will return True if the EPC record is greater than or equal to the other + """ + if not isinstance(other, EPCRecord): + raise ValueError("Can only compare EPCRecord to EPCRecord") + + return self.__dict__[RDSAP_RESPONSE] <= other.__dict__[RDSAP_RESPONSE] + + def get(self, key: str): + """ + This method will return the value of the key + """ + return self.__dict__[key] + # def __init__(self, num) -> None: @@ -573,7 +694,34 @@ def app(): # e.g. first vs second, second vs third and also first vs third property_model_data = [] - temp = [EPCRecord(**x) for x in variable_data.to_dict(orient='records')] + epc_records = [EPCRecord(**x) for x in variable_data.to_dict(orient='records')] + + for idx in range(0, len(epc_records) - 1): + + if idx >= len(epc_records) - 1: + break + + earliest_record: EPCRecord = epc_records[idx] + latest_record: EPCRecord = epc_records[idx + 1] + + # Auto sort the records so that the record with highest RDSAP score is always record1 + difference_record: EPCDifferenceRecord = latest_record - earliest_record + + # TODO: Pull out RDSAP_CHANGE to a variable + if difference_record.get("RDSAP_CHANGE") == 0: + continue + + all_equal = difference_record.compare_fields_in_records( + fields=CORE_COMPONENT_FEATURES + ) + + if all_equal: + # Keep track of this for the moment so we can analyse + all_equal_rows.append({"uprn": uprn, "directory_name": directory.name}) + continue + + property_model_data.append(difference_record.difference_record) + for idx in range(0, property_data.shape[0] - 1): if idx >= property_data.shape[0] - 1: @@ -631,7 +779,7 @@ def app(): # Keep track of this for the moment so we can analyse all_equal_rows.append({"uprn": uprn, "directory_name": directory.name}) continue - + asdasd features = pd.concat([starting_record, ending_record]) property_model_data.append(