From e1137d3ba75034d6d4e32d26b60457c45c9a3d77 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 14 Jun 2023 19:08:22 +0100 Subject: [PATCH] using stricter version of find_keywords and fixed test: --- .coveragerc | 3 ++- epc_data/app.py | 4 +-- epc_data/attributes/MainFuelAttributes.py | 25 +++++++++++++++++-- .../attributes/MainheatControlAttributes.py | 15 ++++++----- epc_data/tests/test_attribute_utils.py | 4 +-- 5 files changed, 36 insertions(+), 15 deletions(-) diff --git a/.coveragerc b/.coveragerc index 8ce77580..f1ad13c2 100644 --- a/.coveragerc +++ b/.coveragerc @@ -3,4 +3,5 @@ omit = epc_data/tests/* epc_data/temp_inputs.py epc_data/config.py - epc_data/__init__.py \ No newline at end of file + epc_data/__init__.py + epc_data/app.py \ No newline at end of file diff --git a/epc_data/app.py b/epc_data/app.py index 09f2b60c..f290488e 100644 --- a/epc_data/app.py +++ b/epc_data/app.py @@ -58,9 +58,9 @@ def handler(): df = df.reset_index(drop=True) import numpy as np - idx = 1 + idx = 14 record = df[df.index == idx].to_dict("records")[0] - record = {k: v for k, v in record.items() if v not in [None, np.nan, False]} + record = {k: v for k, v in record.items() if v not in [None, np.nan]} from pprint import pprint pprint(record) diff --git a/epc_data/attributes/MainFuelAttributes.py b/epc_data/attributes/MainFuelAttributes.py index 8f725d09..f27330c4 100644 --- a/epc_data/attributes/MainFuelAttributes.py +++ b/epc_data/attributes/MainFuelAttributes.py @@ -36,7 +36,7 @@ class MainFuelAttributes: def __init__(self, description: str): self.description: str = remove_punctuation(clean_description(description.lower())) - self.is_community = False if 'not community' in self.description else 'community' in self.description + self.is_community = 'community' in self.description and 'not community' not in self.description self.is_unknown = False self.nodata = not description @@ -63,6 +63,12 @@ class MainFuelAttributes: ), } + # to make this field palatable, if 
no_individual_heating_or_community_network is populated, we'll + # just set it to true + result["no_individual_heating_or_community_network"] = bool( + result["no_individual_heating_or_community_network"] + ) + if not result["fuel_type"]: result["fuel_type"] = self.UNKNOWN_FUEL # We'll do checks on unknown fuel types to ensure we don't miss anything @@ -71,8 +77,23 @@ class MainFuelAttributes: return result def _find_keyword(self, keywords): + description = self.description + + # Sort keywords by length, longest first. + # This ensures that 'time and temperature zone control' + # will be checked before 'temperature zone control' if both are present in the keywords list + keywords.sort(key=len, reverse=True) + for keyword in keywords: - if keyword in self.description: + if keyword in description: + return keyword + + # If no keyword is found, try again after removing punctuation + description_without_punct = remove_punctuation(description) + + for keyword in keywords: + if keyword in description_without_punct: return keyword return None + diff --git a/epc_data/attributes/MainheatControlAttributes.py b/epc_data/attributes/MainheatControlAttributes.py index be792c0b..4bd23bb4 100644 --- a/epc_data/attributes/MainheatControlAttributes.py +++ b/epc_data/attributes/MainheatControlAttributes.py @@ -104,24 +104,23 @@ class MainheatControlAttributes: return result def _find_keyword(self, keywords): - description_words = set(self.description.split()) + description = self.description - # Sort keywords by length, longest first. This ensures that 'time and temperature zone control' + # Sort keywords by length, longest first. 
+ # This ensures that 'time and temperature zone control' # will be checked before 'temperature zone control' if both are present in the keywords list keywords.sort(key=len, reverse=True) for keyword in keywords: - keyword_words = set(keyword.split()) - if keyword_words.issubset(description_words): + if keyword in description: return keyword # If no keyword is found, try again after removing punctuation - description_without_punct = remove_punctuation(self.description) - description_words_without_punct = set(description_without_punct.split()) + description_without_punct = remove_punctuation(description) for keyword in keywords: - keyword_words = set(keyword.split()) - if keyword_words.issubset(description_words_without_punct): + if keyword in description_without_punct: return keyword return None + diff --git a/epc_data/tests/test_attribute_utils.py b/epc_data/tests/test_attribute_utils.py index 26cea6f9..8e38f8b8 100644 --- a/epc_data/tests/test_attribute_utils.py +++ b/epc_data/tests/test_attribute_utils.py @@ -12,10 +12,10 @@ def test_clean_description(): test_cases = [ ("this:is;a*test", "this is a test"), ("hello@world", "hello world"), - ("what?!?", "what "), + ("what?!?", "what "), ("hello(world)", "hello world "), ("", ""), - (":;*@?!", " "), + (":;*@?!", " "), ("no special chars", "no special chars") ]