From c2ebb49b18e0468af42692ce7bf38ba94d2d9ef9 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 14 Jun 2023 18:53:37 +0100 Subject: [PATCH] added separate detection of tariff --- epc_data/app.py | 2 +- epc_data/attributes/MainFuelAttributes.py | 35 +++++++++++++++++------ epc_data/attributes/attribute_utils.py | 10 +++++++ 3 files changed, 37 insertions(+), 10 deletions(-) diff --git a/epc_data/app.py b/epc_data/app.py index f9d21beb..09f2b60c 100644 --- a/epc_data/app.py +++ b/epc_data/app.py @@ -58,7 +58,7 @@ def handler(): df = df.reset_index(drop=True) import numpy as np - idx = 36 + idx = 1 record = df[df.index == idx].to_dict("records")[0] record = {k: v for k, v in record.items() if v not in [None, np.nan, False]} from pprint import pprint diff --git a/epc_data/attributes/MainFuelAttributes.py b/epc_data/attributes/MainFuelAttributes.py index 78dd6ba6..8f725d09 100644 --- a/epc_data/attributes/MainFuelAttributes.py +++ b/epc_data/attributes/MainFuelAttributes.py @@ -11,32 +11,41 @@ class MainFuelAttributes: 'biomass', 'biodiesel', # Note: there is als a category called 'bottled LPG', but only 2/50k entries had this - 'LPG', + 'lpg', 'waste combustion', 'biogas', 'wood logs', - 'dual fuel - mineral + wood', + 'dual fuel mineral wood', 'gas', 'anthracite', 'smokeless coal', 'house coal' ] - NO_FUEL_KEYWORDS = [ - 'To be used only when there is no heating/hot-water system or data is from a community network' + TARIFF_KEYWORDS = [ + 'unspecified tariff' + # We may come across more later but this is all observed for now + ] + + UNKNOWN_FUEL = "unknown" + + NO_INDIVIDUAL_HEATING_OR_COMMUNITY_NETWORK = [ + 'to be used only when there is no heatinghotwater system or data is from a community network' ] def __init__(self, description: str): self.description: str = remove_punctuation(clean_description(description.lower())) - self.nodata = not description or any(keyword in self.description for keyword in self.NO_FUEL_KEYWORDS) - self.is_community = False if 'not community' in self.description else 'community' in self.description + self.is_unknown = False + self.nodata = not description if not self.nodata and not any( self._keyword_in_description(keywords) for keywords in [ - self.FUEL_KEYWORDS + self.FUEL_KEYWORDS, + self.NO_INDIVIDUAL_HEATING_OR_COMMUNITY_NETWORK, + self.TARIFF_KEYWORDS ] ): raise ValueError('Invalid description') @@ -47,10 +56,18 @@ class MainFuelAttributes: def process(self) -> Dict[str, Union[str, bool]]: result: Dict[str, Union[str, bool]] = { "fuel_type": self._find_keyword(self.FUEL_KEYWORDS), - "no_fuel": self._find_keyword(self.NO_FUEL_KEYWORDS), - "is_community": self.is_community + "tariff_type": self._find_keyword(self.TARIFF_KEYWORDS), + "is_community": self.is_community, + "no_individual_heating_or_community_network": self._find_keyword( + self.NO_INDIVIDUAL_HEATING_OR_COMMUNITY_NETWORK + ), } + if not result["fuel_type"]: + result["fuel_type"] = self.UNKNOWN_FUEL + # We'll do checks on unknown fuel types to ensure we don't miss anything + self.is_unknown = True + return result def _find_keyword(self, keywords): diff --git a/epc_data/attributes/attribute_utils.py b/epc_data/attributes/attribute_utils.py index 7bdd4288..25cc9571 100644 --- a/epc_data/attributes/attribute_utils.py +++ b/epc_data/attributes/attribute_utils.py @@ -4,6 +4,7 @@ from typing import Tuple, Union, Dict, List THERMAL_TRANSMITTENCE_STR = r"average thermal transmittance (-?\d+\.\d+)\s(w/m-¦k)" THERMAL_TRANSMITTANCE_REGEX = re.compile(THERMAL_TRANSMITTENCE_STR) +DOUBLE_SPACE_PATTERN = re.compile(r"\s+") def extract_thermal_transmittance(result: dict, description: str) -> Tuple[ @@ -59,6 +60,8 @@ def clean_description(description: str) -> str: special_chars = [":", ";", "*", "@", "?", "!", "(", ")"] for char in special_chars: description = description.replace(char, " ") + + description = remove_double_spaces(description) return description @@ -103,4 +106,11 @@ def remove_punctuation(text: str) -> str: # Use the translation table to remove punctuation from the text text_without_punctuation = text.translate(translation_table) + text_without_punctuation = remove_double_spaces(text_without_punctuation) + return text_without_punctuation + + +def remove_double_spaces(text): + cleaned_text = DOUBLE_SPACE_PATTERN.sub(" ", text) + return cleaned_text