diff --git a/BaseUtility.py b/BaseUtility.py index e799144d..1a31c5d0 100644 --- a/BaseUtility.py +++ b/BaseUtility.py @@ -1,54 +1,14 @@ +from etl.epc.settings import DATA_ANOMALY_MATCHES as data_anon_matches +from etl.epc.settings import DATA_ANOMALY_MATCHES as data_anon_matches + + class Definitions: """ This class contains some base attributes which are used across multiple other classes """ # Anomalies described here: https://epc.opendatacommunities.org/docs/guidance#glossary - DATA_ANOMALY_MATCHES = { - # Invalid reports are where the value provided is out of bounds, e.g. a negative energy rating of -1199 or a - # non-integer, there is no valid energy band for this, so it is marked as INVALID! - "INVALID", - "INVALID!", - # When the energy certificate was first lodged on the register there was no requirement to lodge this data - # item, i.e. a non-mandatory item. - "NO DATA!", - "NODATA!", - # When the energy certificate was first lodged on the register there was no requirement to lodge this data item, - # i.e.a non - mandatory item. - "N/A", - # A value generated by the register to account for a data item that was not mandatory when the lodgement of - # the energy certificate occurred. When the data item became mandatory the register operator, for backwards - # compatibility purposes, populated the data field with a value of ‘not recorded’ to ensure that the energy - # certificate retrieval process is successfully completed. Mandatory data items cannot be applied - # retrospectively to energy certificates lodged before the date of the change. - "Not recorded", - # The data also contains DECs with an operational rating of ‘9999’ (a ‘default’ DEC). The production of a - # ‘default’ DEC value was allowed to enable building occupiers, with poor quality or no energy data, - # the opportunity to comply with the regulations. From April 2011 the ability to lodge a ‘default’ DEC was no - # longer allowed. - "9999", - # The Building Emission Rate (BER) data field for non-domestic buildings may contain a ‘blank’ value. The BER - # was only lodged on the register from 7 March 2010. - "Blank" - # There are currently just over 8,600 records where the local authority identifier is ‘null’. This is due to - # the Register Operator not being able to match the building address in the Markermap Ordinance Survey (GB) - # lookup tables or OS MasterMap Address Layer 2 data. The majority of these addresses have been requested - # manually by energy assessors for inclusion by the Register Operator in the registers (e.g. new builds, - # etc). These records are being published for completeness. An ongoing process to manage these manually added - # addresses will take time to develop to deal with these and future anomalies. - # - # There are several fields within the lodged data where it is possible to enter multiple entries to cater for - # different data_types of build within a single property, i.e. extensions. This results in multiple entries for - # the description fields for floor, roof and wall. For the purposes of this data release only the information - # contained within the first of these multiple entries is being provided. As there are no restrictions on the - # value in this first field it means that sometimes the first field in a multiple entry description field may - # contain a ‘null’ value. A resolution to correct these anomalies will be considered for future data releases. - "NULL", - # We sometimes see fields populated with just an empty string. - "", - # An older value which rarely shows up but has been seen in the data. - "UNKNOWN", - } + DATA_ANOMALY_MATCHES = data_anon_matches DATA_ANOMALY_SUBSTRINGS = { # Where values in a ‘pick’ list that have been superseded by another value. For example, where a value for diff --git a/etl/epc/DataProcessor.py b/etl/epc/DataProcessor.py index f5fc3582..e2740745 100644 --- a/etl/epc/DataProcessor.py +++ b/etl/epc/DataProcessor.py @@ -1,7 +1,6 @@ from pathlib import Path import numpy as np import pandas as pd -from BaseUtility import Definitions from etl.epc.settings import ( DATA_PROCESSOR_SETTINGS, EARLIEST_EPC_DATE, @@ -22,6 +21,7 @@ from etl.epc.settings import ( ENDING_SUFFIX_COMPONENT_COLS, POTENTIAL_COLUMNS, EFFICIENCY_FEATURES, + DATA_ANOMALY_MATCHES ) from recommendations.rdsap_tables import FLOOR_LEVEL_MAP @@ -247,8 +247,7 @@ class EPCDataProcessor: # Map all anomaly values to None data_anomaly_map = dict( zip( - Definitions.DATA_ANOMALY_MATCHES, - [None] * len(Definitions.DATA_ANOMALY_MATCHES), + DATA_ANOMALY_MATCHES, [None] * len(DATA_ANOMALY_MATCHES), ) ) @@ -283,7 +282,7 @@ class EPCDataProcessor: @staticmethod def clean_construction_age_band(x): # Firstly, we check if it's an error value - if x in Definitions.DATA_ANOMALY_MATCHES or x in [None, np.nan]: + if x in DATA_ANOMALY_MATCHES or x in [None, np.nan]: return x # Next, we check if it's a value in our map diff --git a/etl/epc_clean/epc_attributes/MainheatControlAttributes.py b/etl/epc_clean/epc_attributes/MainheatControlAttributes.py index 997865d3..0dcf97c5 100644 --- a/etl/epc_clean/epc_attributes/MainheatControlAttributes.py +++ b/etl/epc_clean/epc_attributes/MainheatControlAttributes.py @@ -128,6 +128,7 @@ class MainheatControlAttributes(Definitions): ] def __init__(self, description: str): + self.description: str = clean_description(description.lower()).strip() self.nodata = not self.description or description in self.DATA_ANOMALY_MATCHES or ( description in self.NO_DATA_DESCRIPTIONS