mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
Use a stricter version of _find_keyword and fix the test
This commit is contained in:
parent
c2ebb49b18
commit
e1137d3ba7
5 changed files with 36 additions and 15 deletions
|
|
@ -3,4 +3,5 @@ omit =
|
|||
epc_data/tests/*
|
||||
epc_data/temp_inputs.py
|
||||
epc_data/config.py
|
||||
epc_data/__init__.py
|
||||
epc_data/__init__.py
|
||||
epc_data/app.py
|
||||
|
|
@ -58,9 +58,9 @@ def handler():
|
|||
df = df.reset_index(drop=True)
|
||||
|
||||
import numpy as np
|
||||
idx = 1
|
||||
idx = 14
|
||||
record = df[df.index == idx].to_dict("records")[0]
|
||||
record = {k: v for k, v in record.items() if v not in [None, np.nan, False]}
|
||||
record = {k: v for k, v in record.items() if v not in [None, np.nan]}
|
||||
from pprint import pprint
|
||||
pprint(record)
|
||||
|
||||
|
|
|
|||
|
|
@ -36,7 +36,7 @@ class MainFuelAttributes:
|
|||
def __init__(self, description: str):
|
||||
self.description: str = remove_punctuation(clean_description(description.lower()))
|
||||
|
||||
self.is_community = False if 'not community' in self.description else 'community' in self.description
|
||||
self.is_community = 'community' in self.description and 'not community' not in self.description
|
||||
self.is_unknown = False
|
||||
self.nodata = not description
|
||||
|
||||
|
|
@ -63,6 +63,12 @@ class MainFuelAttributes:
|
|||
),
|
||||
}
|
||||
|
||||
# to make this field palatable, if no_individual_heating_or_community_network is populated, we'll
|
||||
# just set it to true
|
||||
result["no_individual_heating_or_community_network"] = bool(
|
||||
result["no_individual_heating_or_community_network"]
|
||||
)
|
||||
|
||||
if not result["fuel_type"]:
|
||||
result["fuel_type"] = self.UNKNOWN_FUEL
|
||||
# We'll do checks on unknown fuel types to ensure we don't miss anything
|
||||
|
|
@ -71,8 +77,23 @@ class MainFuelAttributes:
|
|||
return result
|
||||
|
||||
def _find_keyword(self, keywords):
|
||||
description = self.description
|
||||
|
||||
# Sort keywords by length, longest first.
|
||||
# This ensures that 'time and temperature zone control'
|
||||
# will be checked before 'temperature zone control' if both are present in the keywords list
|
||||
keywords.sort(key=len, reverse=True)
|
||||
|
||||
for keyword in keywords:
|
||||
if keyword in self.description:
|
||||
if keyword in description:
|
||||
return keyword
|
||||
|
||||
# If no keyword is found, try again after removing punctuation
|
||||
description_without_punct = remove_punctuation(description)
|
||||
|
||||
for keyword in keywords:
|
||||
if keyword in description_without_punct:
|
||||
return keyword
|
||||
|
||||
return None
|
||||
|
||||
|
|
|
|||
|
|
@ -104,24 +104,23 @@ class MainheatControlAttributes:
|
|||
return result
|
||||
|
||||
def _find_keyword(self, keywords):
|
||||
description_words = set(self.description.split())
|
||||
description = self.description
|
||||
|
||||
# Sort keywords by length, longest first. This ensures that 'time and temperature zone control'
|
||||
# Sort keywords by length, longest first.
|
||||
# This ensures that 'time and temperature zone control'
|
||||
# will be checked before 'temperature zone control' if both are present in the keywords list
|
||||
keywords.sort(key=len, reverse=True)
|
||||
|
||||
for keyword in keywords:
|
||||
keyword_words = set(keyword.split())
|
||||
if keyword_words.issubset(description_words):
|
||||
if keyword in description:
|
||||
return keyword
|
||||
|
||||
# If no keyword is found, try again after removing punctuation
|
||||
description_without_punct = remove_punctuation(self.description)
|
||||
description_words_without_punct = set(description_without_punct.split())
|
||||
description_without_punct = remove_punctuation(description)
|
||||
|
||||
for keyword in keywords:
|
||||
keyword_words = set(keyword.split())
|
||||
if keyword_words.issubset(description_words_without_punct):
|
||||
if keyword in description_without_punct:
|
||||
return keyword
|
||||
|
||||
return None
|
||||
|
||||
|
|
|
|||
|
|
@ -12,10 +12,10 @@ def test_clean_description():
|
|||
test_cases = [
|
||||
("this:is;a*test", "this is a test"),
|
||||
("hello@world", "hello world"),
|
||||
("what?!?", "what "),
|
||||
("what?!?", "what "),
|
||||
("hello(world)", "hello world "),
|
||||
("", ""),
|
||||
(":;*@?!", " "),
|
||||
(":;*@?!", " "),
|
||||
("no special chars", "no special chars")
|
||||
]
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue