Break out clean_description into a shared helper in attribute_utils

This commit is contained in:
Khalim Conn-Kowlessar 2023-06-14 10:23:27 +01:00
parent 363c0745fe
commit d10bed5b6d
5 changed files with 71 additions and 13 deletions

View file

@ -41,7 +41,7 @@ def handler():
# For testing:
from epc_data.attributes.MainheatAttributes import MainHeatAttributes
descriptions = {x["mainheat-description"] for x in data}
descriptions = {x["windows-description"] for x in data}
out = []
for description in descriptions:
res = MainHeatAttributes(description).process()

View file

@ -1,3 +1,4 @@
from epc_data.attributes.attribute_utils import clean_description
from typing import Dict, List, Union
@ -13,7 +14,7 @@ class MainHeatAttributes:
OTHERS = ["assumed", "electricaire", "assumed for most rooms"]
def __init__(self, description: str):
self.description: str = self._clean_description(description.lower())
self.description: str = clean_description(description.lower())
# Remove special characters
if not description or not any(
@ -22,16 +23,6 @@ class MainHeatAttributes:
):
raise ValueError('Invalid description')
@staticmethod
def _clean_description(description: str) -> str:
    """
    Return the description with every special character swapped for a space.

    The characters treated as special are ``: ; * @ ? ! ( )``. Each
    occurrence is replaced by exactly one space, so the length of the
    string never changes.
    """
    to_blank = (":", ";", "*", "@", "?", "!", "(", ")")
    # Rebuild the string character by character instead of chaining replaces.
    return "".join(" " if symbol in to_blank else symbol for symbol in description)
def process(self) -> Dict[str, Union[str, bool]]:
result: Dict[str, Union[str, bool]] = {f'has_{ds.replace(" ", "_")}': False for ds in self.DISTRIBUTION_SYSTEMS}

View file

@ -0,0 +1,42 @@
from typing import Dict, List, Union
class WindowAttributes:
    """
    Turn a free-text windows description into boolean glazing flags.

    The description is lower-cased and stripped of special characters on
    construction; ``process`` then reports one ``has_<type>`` flag for each
    entry in ``WINDOW_TYPES``.
    """
    GLAZING_KEYWORDS = ["glazing", "glazed", "glaze"]
    GLAZING_COVERAGE = ["fully", "mostly", "partial", "some"]
    GLAZING_TYPES = ["double", "triple", "secondary", "multiple", "high performance"]
    # process() reads WINDOW_TYPES, which was never defined on this class.
    # NOTE(review): aliased to GLAZING_TYPES as the only type list present;
    # confirm this is the intended set of flags.
    WINDOW_TYPES = GLAZING_TYPES
    # Same characters the shared clean_description helper in attribute_utils
    # replaces with spaces.
    _SPECIAL_CHARS = (":", ";", "*", "@", "?", "!", "(", ")")

    def __init__(self, description: str):
        """
        Store the cleaned, lower-cased description.

        Raises:
            ValueError: if the description is empty/None or mentions none of
                the glazing keywords, coverage words or glazing types.
        """
        # Validate before calling .lower() so None/"" give the intended
        # ValueError rather than an AttributeError.
        if not description:
            raise ValueError('Invalid description')
        self.description: str = self._clean_description(description.lower())
        known_terms = self.GLAZING_KEYWORDS + self.GLAZING_COVERAGE + self.GLAZING_TYPES
        if not any(term in self.description for term in known_terms):
            raise ValueError('Invalid description')

    @classmethod
    def _clean_description(cls, description: str) -> str:
        """
        Clean the description by replacing any special characters with a space.

        Defined here because __init__ calls ``self._clean_description`` and the
        class previously had no such attribute.
        """
        for char in cls._SPECIAL_CHARS:
            description = description.replace(char, " ")
        return description

    def process(self) -> Dict[str, Union[str, bool]]:
        """
        Return a dict of ``has_<window type>`` flags for this description.

        Every flag starts as False; the description is split on commas and
        each part may switch flags to True.
        """
        result: Dict[str, Union[str, bool]] = {
            f'has_{wt.replace(" ", "_")}': False for wt in self.WINDOW_TYPES
        }
        # Process each comma-separated part separately
        for part in self.description.split(','):
            self._process_part(result, part.strip(), self.WINDOW_TYPES, 'has_')
        return result

    @staticmethod
    def _process_part(result: Dict[str, Union[str, bool]], part: str, attr_list: List[str], prefix: str):
        """
        Process a part of the description with a given list of attributes
        and update the result dictionary.

        An attribute matches when all of its words occur as whole words in
        ``part``, in any order. ``result`` is mutated in place and returned.
        """
        # Build the word set once instead of once per attribute.
        part_words = set(part.split())
        for attr in attr_list:
            if set(attr.split()).issubset(part_words):
                result[f'{prefix}{attr.replace(" ", "_")}'] = True
        return result

View file

@ -49,3 +49,13 @@ def extract_component_types(result: dict, description: str, list_of_components:
description = description.replace(component, "")
return result, description
def clean_description(description: str) -> str:
    """
    Return the description with each special character replaced by a space.

    The special characters are ``: ; * @ ? ! ( )``. The replacement is
    one-for-one, so runs of special characters become runs of spaces.
    """
    # One translation table covers every special character in a single pass.
    blank_out = str.maketrans(dict.fromkeys(":;*@?!()", " "))
    return description.translate(blank_out)

View file

@ -1,7 +1,22 @@
from epc_data.attributes.attribute_utils import extract_thermal_transmittance
from epc_data.attributes.attribute_utils import extract_thermal_transmittance, clean_description
def test_extract_thermal_transmittance():
    """Check the (attributes, remaining description) pair returned for one sample description."""
    expected_attributes = {
        'thermal_transmittance': 2.3,
        'thermal_transmittance_unit': 'w/m-¦k',
    }
    outcome = extract_thermal_transmittance({}, "average thermal transmittance 2.3 w/m-¦k")
    # The second element is the description left over after extraction.
    assert outcome == (expected_attributes, '')
def test_clean_description():
    """
    Check that clean_description replaces each special character with one space.

    Runs of special characters are written as ``" " * n`` so the expected
    number of spaces is explicit and survives whitespace-collapsing tools.
    """
    test_cases = [
        ("this:is;a*test", "this is a test"),
        ("hello@world", "hello world"),
        # Three special characters -> three spaces, not one.
        ("what?!?", "what" + " " * 3),
        ("hello(world)", "hello world "),
        ("", ""),
        # Six special characters -> six spaces.
        (":;*@?!", " " * 6),
        ("no special chars", "no special chars"),
    ]
    for input_str, expected_output in test_cases:
        # Include the input in the failure message to identify the failing case.
        assert clean_description(input_str) == expected_output, input_str