# Model/model_data/epc_attributes/attribute_utils.py

import re
import string
from typing import Tuple, Union, Dict, List

# Pattern source for the phrase "average thermal transmittance <value> <unit>".
# Group 1 captures an optionally negative decimal number; a decimal point is required, so a bare integer such as "2"
# does not match. Group 2 captures the literal unit text "w/m-¦k". The pattern is lowercase and compiled without
# re.IGNORECASE, so it only matches lowercase input.
# NOTE(review): "-¦" looks like a mis-encoded superscript two (W/m²K) - confirm against the raw data before changing.
# NOTE(review): the name is spelled "TRANSMITTENCE"; it is referenced under that spelling elsewhere in this file.
THERMAL_TRANSMITTENCE_STR = r"average thermal transmittance (-?\d+\.\d+)\s(w/m-¦k)"
# Compiled form of the pattern above.
THERMAL_TRANSMITTANCE_REGEX = re.compile(THERMAL_TRANSMITTENCE_STR)
# Matches any run of one or more whitespace characters (tabs and newlines included), not only double spaces.
DOUBLE_SPACE_PATTERN = re.compile(r"\s+")


def extract_thermal_transmittance(result: dict, description: str) -> Tuple[
    Dict[str, Union[None, str, float]], str
]:
    """
    Pull the average thermal transmittance value and its unit out of a description.

    The ``result`` dictionary is modified in place: the keys ``thermal_transmittance`` and
    ``thermal_transmittance_unit`` are always written, holding ``None`` when the phrase is absent.

    :param result: Dictionary that receives the extracted value and unit.
    :param description: Lowercase description string (the pattern only matches lowercase text).
    :return: A tuple of the same ``result`` dictionary and the description with every occurrence of the thermal
    transmittance phrase removed. The description is returned unchanged when nothing matched.
    """
    found = THERMAL_TRANSMITTANCE_REGEX.search(description)

    # Guard clause: nothing to extract, record the absence and leave the text untouched.
    if found is None:
        result['thermal_transmittance'] = None
        result['thermal_transmittance_unit'] = None
        return result, description

    value_text, unit_text = found.groups()
    result['thermal_transmittance'] = float(value_text)
    result['thermal_transmittance_unit'] = unit_text

    # The value comes from the first match, but all occurrences of the phrase are stripped from the text.
    remaining = THERMAL_TRANSMITTANCE_REGEX.sub("", description)
    return result, remaining


def extract_component_types(result: dict, description: str, list_of_components: list) -> Tuple[
    Dict[str, Union[None, str, float]], str
]:
    """
    Flag which component types occur in the description and strip them from it.

    For every component a boolean is stored in ``result`` under ``is_<component>`` (spaces in the component become
    underscores). Matching is a plain substring test, and components are handled in list order against the
    progressively shortened description, so an earlier component can remove text a later one would have matched.

    :param result: Dictionary to store the flags in; it is modified in place.
    :param description: Lowercase description string.
    :param list_of_components: Component type strings to look for.
    :return: A tuple of the same ``result`` dictionary and the description with every matched component removed.
    """
    remaining = description
    for component in list_of_components:
        flag_key = "is_" + component.replace(" ", "_")
        present = component in remaining
        result[flag_key] = present
        if present:
            # Drop every occurrence so later processing does not see this component again.
            remaining = remaining.replace(component, "")

    return result, remaining


def clean_description(description: str) -> str:
    """
    Replace each of the characters ``: ; * @ ? ! ( )`` with a space, then collapse whitespace runs.

    :param description: Description string to clean.
    :return: The cleaned description. Leading or trailing whitespace is collapsed but not stripped.
    """
    # One translation pass maps every special character to a space.
    to_space = str.maketrans({char: " " for char in ":;*@?!()"})
    return remove_double_spaces(description.translate(to_space))


def switch_chars(description: str) -> str:
    """
    Replace separator characters in a description with a comma.

    Currently only ``:`` is switched, which helps with descriptions like "Gas: mains gas".

    :param description: Description string to convert.
    :return: The description with every ``:`` replaced by ``,``.
    """
    # Extend the first argument (and pad the second with commas) to switch further characters.
    to_comma = str.maketrans(":", ",")
    return description.translate(to_comma)


def process_part(result: Dict[str, Union[str, bool]], part: str, attr_list: List[str], prefix: str):
    """
    Mark the attributes whose words all occur in one part of a description.

    An attribute matches when every whitespace-separated word of it appears as a whole word in ``part``; word
    order and adjacency are not checked. For each match ``result[prefix + attribute]`` is set to ``True``, with
    spaces in the attribute replaced by underscores.

    :param result: Non-empty dictionary to update; it is modified in place.
    :param part: The piece of description text to inspect.
    :param attr_list: List of attribute strings to look for.
    :param prefix: Non-empty string prepended to each attribute when building the result key.
    :return: The same ``result`` dictionary.
    :raises TypeError: If any argument has the wrong type.
    :raises ValueError: If ``result`` or ``prefix`` is empty, or if no value in ``result`` is truthy after
    processing. The final check looks at every value in ``result``, including ones set before this call.
    """
    # Argument types are validated first, in the same order as the parameters.
    if not isinstance(result, dict):
        raise TypeError('Expected a dictionary for result')
    if not isinstance(part, str):
        raise TypeError('Expected a string for part')
    if not (isinstance(attr_list, list) and all(isinstance(item, str) for item in attr_list)):
        raise TypeError('Expected a list of strings for attr_list')
    if not isinstance(prefix, str):
        raise TypeError('Expected a string for prefix')

    # Emptiness checks follow the type checks.
    if not result:
        raise ValueError("Result dictionary cannot be empty")
    if not prefix:
        raise ValueError("Prefix cannot be empty")

    # Build the word set once; every attribute is tested against it.
    words_in_part = set(part.split())
    for attr in attr_list:
        if words_in_part.issuperset(attr.split()):
            result[prefix + attr.replace(" ", "_")] = True

    if not any(result.values()):
        raise ValueError("No attribute matches found")

    return result


def remove_punctuation(text: str) -> str:
    """
    Delete every ``string.punctuation`` character from the text and tidy the whitespace.

    Punctuation is removed outright rather than replaced with a space, so "a,b" becomes "ab".

    :param text: Text to clean.
    :return: The text without punctuation, with whitespace runs collapsed and the ends stripped.
    """
    # Mapping a character to None in a translation table deletes it.
    deletion_table = str.maketrans(dict.fromkeys(string.punctuation))
    return remove_double_spaces(text.translate(deletion_table)).strip()


def remove_double_spaces(text):
    """
    Collapse every run of whitespace in the text into a single space.

    Any whitespace run is affected (tabs and newlines included), not only doubled spaces. Leading and trailing
    whitespace is reduced to one space rather than stripped.

    :param text: Text to normalise.
    :return: The text with each whitespace run replaced by one space.
    """
    return DOUBLE_SPACE_PATTERN.sub(" ", text)


def find_keyword(description, keywords, synonyms=None):
    """
    Return the longest keyword contained in the description, mapped through ``synonyms``.

    The description is searched as given first; only if nothing is found is it searched again with punctuation
    removed. Matching is a plain substring test.

    :param description: Text to search.
    :param keywords: Iterable of keyword strings. It is not modified; a sorted copy is used internally.
    :param synonyms: Optional mapping from a keyword to the value to return in its place.
    :return: The synonym of the first matching keyword, the keyword itself when it has no synonym, or ``None``
    when no keyword is found.
    """
    if synonyms is None:
        synonyms = {}

    # Check longest keywords first so that 'time and temperature zone control' wins over
    # 'temperature zone control' when both are listed. sorted() works on a copy, so the caller's
    # list keeps its order and non-list iterables such as tuples are accepted.
    ordered_keywords = sorted(keywords, key=len, reverse=True)

    for keyword in ordered_keywords:
        if keyword in description:
            return synonyms.get(keyword, keyword)

    # If no keyword is found, try again after removing punctuation
    description_without_punct = remove_punctuation(description)

    for keyword in ordered_keywords:
        if keyword in description_without_punct:
            return synonyms.get(keyword, keyword)

    return None