diff --git a/model_data/BaseUtility.py b/model_data/BaseUtility.py new file mode 100644 index 00000000..2474e517 --- /dev/null +++ b/model_data/BaseUtility.py @@ -0,0 +1,54 @@ +class BaseUtility: + """ + This class contains some base attributes which are used across multiple other classes + """ + + # Anomalies described here: https://epc.opendatacommunities.org/docs/guidance#glossary + DATA_ANOMALY_MATCHES = { + # Invalid reports are where the value provided is out of bounds, e.g. a negative energy rating of -1199 or a + # non-integer, there is no valid energy band for this, so it is marked as INVALID! + "INVALID", + # When the energy certificate was first lodged on the register there was no requirement to lodge this data + # item, i.e. a non-mandatory item. + "NO DATA!", + # When the energy certificate was first lodged on the register there was no requirement to lodge this data item, + # i.e. a non-mandatory item. + "N/A", + # A value generated by the register to account for a data item that was not mandatory when the lodgement of + # the energy certificate occurred. When the data item became mandatory the register operator, for backwards + # compatibility purposes, populated the data field with a value of ‘not recorded’ to ensure that the energy + # certificate retrieval process is successfully completed. Mandatory data items cannot be applied + # retrospectively to energy certificates lodged before the date of the change. + "Not recorded", + # The data also contains DECs with an operational rating of ‘9999’ (a ‘default’ DEC). The production of a + # ‘default’ DEC value was allowed to enable building occupiers, with poor quality or no energy data, + # the opportunity to comply with the regulations. From April 2011 the ability to lodge a ‘default’ DEC was no + # longer allowed. + "9999", + # The Building Emission Rate (BER) data field for non-domestic buildings may contain a ‘blank’ value. The BER + # was only lodged on the register from 7 March 2010. 
+ "Blank", + # There are currently just over 8,600 records where the local authority identifier is ‘null’. This is due to + # the Register Operator not being able to match the building address in the Markermap Ordnance Survey (GB) + # lookup tables or OS MasterMap Address Layer 2 data. The majority of these addresses have been requested + # manually by energy assessors for inclusion by the Register Operator in the registers (e.g. new builds, + # etc). These records are being published for completeness. An ongoing process to manage these manually added + # addresses will take time to develop to deal with these and future anomalies. + # + # There are several fields within the lodged data where it is possible to enter multiple entries to cater for + # different types of build within a single property, i.e. extensions. This results in multiple entries for + # the description fields for floor, roof and wall. For the purposes of this data release only the information + # contained within the first of these multiple entries is being provided. As there are no restrictions on the + # value in this first field it means that sometimes the first field in a multiple entry description field may + # contain a ‘null’ value. A resolution to correct these anomalies will be considered for future data releases. + "NULL", + } + + DATA_ANOMALY_SUBSTRINGS = { + # Where values in a ‘pick’ list that have been superseded by another value. For example, where a value for + # ‘pitched roof’ has been replaced by three sub-categories of pitched roof. The original value is retained + # but ‘for backward compatibility only’ it is appended to ensure that the energy certificate retrieval + # process can be successfully completed. Replacement data items cannot be applied retrospectively to energy + # certificates lodged on the register before the date of the change. 
+ "for backward compatibility only", + } diff --git a/model_data/Property.py b/model_data/Property.py index 0ff448c4..1de3577b 100644 --- a/model_data/Property.py +++ b/model_data/Property.py @@ -2,9 +2,10 @@ from epc_api.client import EpcClient from model_data.config import EPC_AUTH_TOKEN from model_data.OpenUprnClient import OpenUprnClient from model_data.EpcClean import EpcClean +from model_data.BaseUtility import BaseUtility -class Property: +class Property(BaseUtility): ATTRIBUTE_MAP = { "floor-description": "floor", "hotwater-description": "hotwater", @@ -91,6 +92,10 @@ class Property: for description, attribute in cleaner.cleaned.items(): + if self.data[description] in self.DATA_ANOMALY_MATCHES: + setattr(self, self.ATTRIBUTE_MAP[description], {"original_description": self.data[description]}) + continue + attributes = [ x for x in cleaner.cleaned[description] if x["original_description"] == self.data[description] ] diff --git a/model_data/app.py b/model_data/app.py index 29ca8cab..62d4fc4b 100644 --- a/model_data/app.py +++ b/model_data/app.py @@ -1,5 +1,3 @@ -import pandas as pd -from pprint import pprint from tqdm import tqdm import os from model_data.BoreholeClient import BoreholeClient @@ -54,8 +52,6 @@ def handler(): for p in input_properties: p.get_coordinates(open_uprn_client) - pprint(input_properties[0].coordinates) - local_authorities = {p.data['local-authority'] for p in input_properties} data = [] @@ -69,19 +65,10 @@ def handler(): ) ) - cleaner = EpcClean(data) - + # Incorporate input data into cleaning + cleaner = EpcClean(data + [p.data for p in input_properties]) cleaner.clean() - # example cleaned data - # Why do we need this stuff? 
- # https://docs.google.com/spreadsheets/d/1ek9ItDv7xHwFm_FK6B0PyOBwvi6U4qRPuncBsVlCHUA/edit#gid=0 - cleaner.cleaned.keys() - floors = pd.DataFrame(cleaner.cleaned['floor-description']) - walls = pd.DataFrame(cleaner.cleaned['walls-description']) - hotwater = pd.DataFrame(cleaner.cleaned['hotwater-description']) - mainheat = pd.DataFrame(cleaner.cleaned["mainheat-description"]) - address_meta = [ { "postcode": x["postcode"].upper(), @@ -110,3 +97,5 @@ def handler(): # on the cleaning we've done for p in input_properties: p.get_components(cleaner) + + # Now, given the components, we want to identify upgrade options diff --git a/model_data/epc_attributes/MainFuelAttributes.py b/model_data/epc_attributes/MainFuelAttributes.py index fa3c4618..238c53ab 100644 --- a/model_data/epc_attributes/MainFuelAttributes.py +++ b/model_data/epc_attributes/MainFuelAttributes.py @@ -1,8 +1,9 @@ from typing import Dict, Union +from model_data.BaseUtility import BaseUtility from model_data.epc_attributes.attribute_utils import clean_description, remove_punctuation, find_keyword -class MainFuelAttributes: +class MainFuelAttributes(BaseUtility): FUEL_KEYWORDS = [ 'heat network', 'mains gas', @@ -42,7 +43,7 @@ class MainFuelAttributes: self.is_community = 'community' in self.description and 'not community' not in self.description self.is_unknown = False - self.nodata = not description + self.nodata = not description or description in self.DATA_ANOMALY_MATCHES if not self.nodata and not any( self._keyword_in_description(keywords) @@ -59,6 +60,17 @@ class MainFuelAttributes: return any(keyword in self.description for keyword in keywords) def process(self) -> Dict[str, Union[str, bool]]: + + if self.nodata: + result = { + "fuel_type": None, + "tariff_type": None, + "is_community": False, + "no_individual_heating_or_community_network": False, + "complex_fuel_type": False + } + return result + result: Dict[str, Union[str, bool]] = { "fuel_type": find_keyword(self.description, 
self.FUEL_KEYWORDS), "tariff_type": find_keyword(self.description, self.TARIFF_KEYWORDS),