From 6a48eea55c686f62b24a3837d46fc9de52e40b52 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar
Date: Wed, 14 Jun 2023 19:12:06 +0100
Subject: [PATCH] Implemented stricter find_keywords and made it a util

---
Review notes for this revision of the patch:
* Line breaks and indentation were rebuilt from a whitespace-collapsed copy;
  hunk line counts were checked against the rebuilt hunks. Exact leading
  whitespace of context lines should be confirmed against the target tree.
* MainheatControlAttributes.process: the `result: ... = {` line is now kept as
  context. The previous hunk removed it without adding a replacement, leaving
  the dict entries without an opening statement.
* find_keyword: uses sorted() instead of keywords.sort(), so the keyword lists
  passed in (class-level constants in both callers) are not reordered in
  place. A docstring was added.
* The post-image blob ids on the `index` lines of the two edited files were
  not recomputed.

 epc_data/attributes/MainFuelAttributes.py     | 32 +++------------
 .../attributes/MainheatControlAttributes.py   | 40 ++++-------------
 epc_data/attributes/attribute_utils.py        | 27 ++++++++++++
 3 files changed, 41 insertions(+), 58 deletions(-)

diff --git a/epc_data/attributes/MainFuelAttributes.py b/epc_data/attributes/MainFuelAttributes.py
index f27330c4..b54c35ac 100644
--- a/epc_data/attributes/MainFuelAttributes.py
+++ b/epc_data/attributes/MainFuelAttributes.py
@@ -1,5 +1,5 @@
 from typing import Dict, Union
-from epc_data.attributes.attribute_utils import clean_description, remove_punctuation
+from epc_data.attributes.attribute_utils import clean_description, remove_punctuation, find_keyword
 
 
 class MainFuelAttributes:
@@ -55,11 +55,11 @@ class MainFuelAttributes:
 
     def process(self) -> Dict[str, Union[str, bool]]:
         result: Dict[str, Union[str, bool]] = {
-            "fuel_type": self._find_keyword(self.FUEL_KEYWORDS),
-            "tariff_type": self._find_keyword(self.TARIFF_KEYWORDS),
+            "fuel_type": find_keyword(self.description, self.FUEL_KEYWORDS),
+            "tariff_type": find_keyword(self.description, self.TARIFF_KEYWORDS),
             "is_community": self.is_community,
-            "no_individual_heating_or_community_network": self._find_keyword(
-                self.NO_INDIVIDUAL_HEATING_OR_COMMUNITY_NETWORK
+            "no_individual_heating_or_community_network": find_keyword(
+                self.description, self.NO_INDIVIDUAL_HEATING_OR_COMMUNITY_NETWORK
             ),
         }
 
@@ -75,25 +75,3 @@ class MainFuelAttributes:
             self.is_unknown = True
 
         return result
-
-    def _find_keyword(self, keywords):
-        description = self.description
-
-        # Sort keywords by length, longest first.
-        # This ensures that 'time and temperature zone control'
-        # will be checked before 'temperature zone control' if both are present in the keywords list
-        keywords.sort(key=len, reverse=True)
-
-        for keyword in keywords:
-            if keyword in description:
-                return keyword
-
-        # If no keyword is found, try again after removing punctuation
-        description_without_punct = remove_punctuation(description)
-
-        for keyword in keywords:
-            if keyword in description_without_punct:
-                return keyword
-
-        return None
-
diff --git a/epc_data/attributes/MainheatControlAttributes.py b/epc_data/attributes/MainheatControlAttributes.py
index 4bd23bb4..622d6e05 100644
--- a/epc_data/attributes/MainheatControlAttributes.py
+++ b/epc_data/attributes/MainheatControlAttributes.py
@@ -1,5 +1,5 @@
 from typing import Dict, Union
-from epc_data.attributes.attribute_utils import clean_description, remove_punctuation
+from epc_data.attributes.attribute_utils import clean_description, remove_punctuation, find_keyword
 
 
 class MainheatControlAttributes:
@@ -86,41 +86,19 @@ class MainheatControlAttributes:
         return any(keyword in self.description for keyword in keywords)
 
     def process(self) -> Dict[str, Union[str, bool]]:
         result: Dict[str, Union[str, bool]] = {
-            "thermostatic_control": self._find_keyword(self.THERMOSTATIC_CONTROL_KEYWORDS),
-            "charging_system": self._find_keyword(self.CHARGING_SYSTEM_KEYWORDS),
-            "switch_system": self._find_keyword(self.SWITCH_SYSTEM_KEYWORDS),
-            "no_control": self._find_keyword(self.NO_CONTROL_SYSTEM_KEYWORDS),
-            "dhw_control": self._find_keyword(self.DHW_CONTROL_KEYWORDS),
-            "community_heating": self._find_keyword(self.COMMUNITY_HEATING_KEYWORDS),
+            "thermostatic_control": find_keyword(self.description, self.THERMOSTATIC_CONTROL_KEYWORDS),
+            "charging_system": find_keyword(self.description, self.CHARGING_SYSTEM_KEYWORDS),
+            "switch_system": find_keyword(self.description, self.SWITCH_SYSTEM_KEYWORDS),
+            "no_control": find_keyword(self.description, self.NO_CONTROL_SYSTEM_KEYWORDS),
+            "dhw_control": find_keyword(self.description, self.DHW_CONTROL_KEYWORDS),
+            "community_heating": find_keyword(self.description, self.COMMUNITY_HEATING_KEYWORDS),
             "multiple_room_thermostats": any(
                 phrase in self.description
                 for phrase in self.MULTIPLE_ROOM_THERMOSTATS_PHRASES
             ),
-            "auxiliary_systems": self._find_keyword(self.AUXILIARY_SYSTEM_KEYWORDS),
-            "trvs": self._find_keyword(self.TRVS_KEYWORDS)
+            "auxiliary_systems": find_keyword(self.description, self.AUXILIARY_SYSTEM_KEYWORDS),
+            "trvs": find_keyword(self.description, self.TRVS_KEYWORDS)
         }
 
         return result
-
-    def _find_keyword(self, keywords):
-        description = self.description
-
-        # Sort keywords by length, longest first.
-        # This ensures that 'time and temperature zone control'
-        # will be checked before 'temperature zone control' if both are present in the keywords list
-        keywords.sort(key=len, reverse=True)
-
-        for keyword in keywords:
-            if keyword in description:
-                return keyword
-
-        # If no keyword is found, try again after removing punctuation
-        description_without_punct = remove_punctuation(description)
-
-        for keyword in keywords:
-            if keyword in description_without_punct:
-                return keyword
-
-        return None
-
diff --git a/epc_data/attributes/attribute_utils.py b/epc_data/attributes/attribute_utils.py
index 25cc9571..92da41e9 100644
--- a/epc_data/attributes/attribute_utils.py
+++ b/epc_data/attributes/attribute_utils.py
@@ -114,3 +114,30 @@ def remove_punctuation(text: str) -> str:
 def remove_double_spaces(text):
     cleaned_text = DOUBLE_SPACE_PATTERN.sub(" ", text)
     return cleaned_text
+
+
+def find_keyword(description, keywords):
+    """Return the longest keyword contained in ``description``, or ``None``.
+
+    The description is searched as given first; only if nothing matches is it
+    searched again after being passed through ``remove_punctuation``.
+
+    Longest-first ordering ensures that 'time and temperature zone control'
+    is checked before 'temperature zone control' when both are in ``keywords``.
+    """
+    # sorted() builds a new list, so the caller's keyword list (a shared
+    # class-level constant in both callers) is never reordered in place.
+    ordered_keywords = sorted(keywords, key=len, reverse=True)
+
+    for keyword in ordered_keywords:
+        if keyword in description:
+            return keyword
+
+    # If no keyword is found, try again after removing punctuation
+    description_without_punct = remove_punctuation(description)
+
+    for keyword in ordered_keywords:
+        if keyword in description_without_punct:
+            return keyword
+
+    return None