Use a stricter version of _find_keyword and fix the test

This commit is contained in:
Khalim Conn-Kowlessar 2023-06-14 19:08:22 +01:00
parent c2ebb49b18
commit e1137d3ba7
5 changed files with 36 additions and 15 deletions

View file

@ -3,4 +3,5 @@ omit =
epc_data/tests/*
epc_data/temp_inputs.py
epc_data/config.py
epc_data/__init__.py
epc_data/__init__.py
epc_data/app.py

View file

@ -58,9 +58,9 @@ def handler():
df = df.reset_index(drop=True)
import numpy as np
idx = 1
idx = 14
record = df[df.index == idx].to_dict("records")[0]
record = {k: v for k, v in record.items() if v not in [None, np.nan, False]}
record = {k: v for k, v in record.items() if v not in [None, np.nan]}
from pprint import pprint
pprint(record)

View file

@ -36,7 +36,7 @@ class MainFuelAttributes:
def __init__(self, description: str):
self.description: str = remove_punctuation(clean_description(description.lower()))
self.is_community = False if 'not community' in self.description else 'community' in self.description
self.is_community = 'community' in self.description and 'not community' not in self.description
self.is_unknown = False
self.nodata = not description
@ -63,6 +63,12 @@ class MainFuelAttributes:
),
}
# to make this field palatable, if no_individual_heating_or_community_network is populated, we'll
# just set it to true
result["no_individual_heating_or_community_network"] = bool(
result["no_individual_heating_or_community_network"]
)
if not result["fuel_type"]:
result["fuel_type"] = self.UNKNOWN_FUEL
# We'll do checks on unknown fuel types to ensure we don't miss anything
@ -71,8 +77,23 @@ class MainFuelAttributes:
return result
def _find_keyword(self, keywords):
description = self.description
# Sort keywords by length, longest first.
# This ensures that 'time and temperature zone control'
# will be checked before 'temperature zone control' if both are present in the keywords list
keywords.sort(key=len, reverse=True)
for keyword in keywords:
if keyword in self.description:
if keyword in description:
return keyword
# If no keyword is found, try again after removing punctuation
description_without_punct = remove_punctuation(description)
for keyword in keywords:
if keyword in description_without_punct:
return keyword
return None

View file

@ -104,24 +104,23 @@ class MainheatControlAttributes:
return result
def _find_keyword(self, keywords):
description_words = set(self.description.split())
description = self.description
# Sort keywords by length, longest first. This ensures that 'time and temperature zone control'
# Sort keywords by length, longest first.
# This ensures that 'time and temperature zone control'
# will be checked before 'temperature zone control' if both are present in the keywords list
keywords.sort(key=len, reverse=True)
for keyword in keywords:
keyword_words = set(keyword.split())
if keyword_words.issubset(description_words):
if keyword in description:
return keyword
# If no keyword is found, try again after removing punctuation
description_without_punct = remove_punctuation(self.description)
description_words_without_punct = set(description_without_punct.split())
description_without_punct = remove_punctuation(description)
for keyword in keywords:
keyword_words = set(keyword.split())
if keyword_words.issubset(description_words_without_punct):
if keyword in description_without_punct:
return keyword
return None

View file

@ -12,10 +12,10 @@ def test_clean_description():
test_cases = [
("this:is;a*test", "this is a test"),
("hello@world", "hello world"),
("what?!?", "what "),
("what?!?", "what "),
("hello(world)", "hello world "),
("", ""),
(":;*@?!", " "),
(":;*@?!", " "),
("no special chars", "no special chars")
]