From e1137d3ba75034d6d4e32d26b60457c45c9a3d77 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Wed, 14 Jun 2023 19:08:22 +0100
Subject: [PATCH] Use stricter version of _find_keyword and fix test

---
 .coveragerc                                   |  3 ++-
 epc_data/app.py                               |  4 +--
 epc_data/attributes/MainFuelAttributes.py     | 25 +++++++++++++++++--
 .../attributes/MainheatControlAttributes.py   | 15 ++++++-----
 epc_data/tests/test_attribute_utils.py        |  4 +--
 5 files changed, 36 insertions(+), 15 deletions(-)

diff --git a/.coveragerc b/.coveragerc
index 8ce77580..f1ad13c2 100644
--- a/.coveragerc
+++ b/.coveragerc
@@ -3,4 +3,5 @@ omit =
     epc_data/tests/*
     epc_data/temp_inputs.py
     epc_data/config.py
-    epc_data/__init__.py
\ No newline at end of file
+    epc_data/__init__.py
+    epc_data/app.py
\ No newline at end of file
diff --git a/epc_data/app.py b/epc_data/app.py
index 09f2b60c..f290488e 100644
--- a/epc_data/app.py
+++ b/epc_data/app.py
@@ -58,9 +58,9 @@ def handler():
     df = df.reset_index(drop=True)
 
     import numpy as np
-    idx = 1
+    idx = 14
     record = df[df.index == idx].to_dict("records")[0]
-    record = {k: v for k, v in record.items() if v not in [None, np.nan, False]}
+    record = {k: v for k, v in record.items() if v not in [None, np.nan]}
     from pprint import pprint
     pprint(record)
 
diff --git a/epc_data/attributes/MainFuelAttributes.py b/epc_data/attributes/MainFuelAttributes.py
index 8f725d09..f27330c4 100644
--- a/epc_data/attributes/MainFuelAttributes.py
+++ b/epc_data/attributes/MainFuelAttributes.py
@@ -36,7 +36,7 @@ class MainFuelAttributes:
     def __init__(self, description: str):
         self.description: str = remove_punctuation(clean_description(description.lower()))
 
-        self.is_community = False if 'not community' in self.description else 'community' in self.description
+        self.is_community = 'community' in self.description and 'not community' not in self.description
         self.is_unknown = False
         self.nodata = not description
 
@@ -63,6 +63,12 @@ class MainFuelAttributes:
             ),
         }
 
+        # To make this field palatable, collapse it to a boolean: True if
+        # no_individual_heating_or_community_network is populated, otherwise False.
+        result["no_individual_heating_or_community_network"] = bool(
+            result["no_individual_heating_or_community_network"]
+        )
+
         if not result["fuel_type"]:
             result["fuel_type"] = self.UNKNOWN_FUEL
             # We'll do checks on unknown fuel types to ensure we don't miss anything
@@ -71,8 +77,23 @@ class MainFuelAttributes:
         return result
 
     def _find_keyword(self, keywords):
+        description = self.description
+
+        # Sort keywords by length, longest first.
+        # This ensures that 'time and temperature zone control'
+        # will be checked before 'temperature zone control' if both are present in the keywords list
+        keywords.sort(key=len, reverse=True)
+
         for keyword in keywords:
-            if keyword in self.description:
+            if keyword in description:
+                return keyword
+
+        # If no keyword is found, try again after removing punctuation
+        description_without_punct = remove_punctuation(description)
+
+        for keyword in keywords:
+            if keyword in description_without_punct:
                 return keyword
 
         return None
+
diff --git a/epc_data/attributes/MainheatControlAttributes.py b/epc_data/attributes/MainheatControlAttributes.py
index be792c0b..4bd23bb4 100644
--- a/epc_data/attributes/MainheatControlAttributes.py
+++ b/epc_data/attributes/MainheatControlAttributes.py
@@ -104,24 +104,23 @@ class MainheatControlAttributes:
         return result
 
     def _find_keyword(self, keywords):
-        description_words = set(self.description.split())
+        description = self.description
 
-        # Sort keywords by length, longest first. This ensures that 'time and temperature zone control'
+        # Sort keywords by length, longest first.
+        # This ensures that 'time and temperature zone control'
         # will be checked before 'temperature zone control' if both are present in the keywords list
         keywords.sort(key=len, reverse=True)
 
         for keyword in keywords:
-            keyword_words = set(keyword.split())
-            if keyword_words.issubset(description_words):
+            if keyword in description:
                 return keyword
 
         # If no keyword is found, try again after removing punctuation
-        description_without_punct = remove_punctuation(self.description)
-        description_words_without_punct = set(description_without_punct.split())
+        description_without_punct = remove_punctuation(description)
 
         for keyword in keywords:
-            keyword_words = set(keyword.split())
-            if keyword_words.issubset(description_words_without_punct):
+            if keyword in description_without_punct:
                 return keyword
 
         return None
+
diff --git a/epc_data/tests/test_attribute_utils.py b/epc_data/tests/test_attribute_utils.py
index 26cea6f9..8e38f8b8 100644
--- a/epc_data/tests/test_attribute_utils.py
+++ b/epc_data/tests/test_attribute_utils.py
@@ -12,10 +12,10 @@ def test_clean_description():
     test_cases = [
         ("this:is;a*test", "this is a test"),
         ("hello@world", "hello world"),
-        ("what?!?", "what   "),
+        ("what?!?", "what "),
         ("hello(world)", "hello world "),
         ("", ""),
-        (":;*@?!", "      "),
+        (":;*@?!", " "),
         ("no special chars", "no special chars")
     ]