import re
import string
from typing import Dict, List, Optional, Tuple, Union

# Pattern for the thermal transmittance phrase in a lowercase description.
# Group 1 is a signed decimal number (a fractional part is required); group 2
# is the unit exactly as it is spelled in the text being matched.
THERMAL_TRANSMITTENCE_STR = r"average thermal transmittance (-?\d+\.\d+)\s(w/m-¦k)"
THERMAL_TRANSMITTANCE_REGEX = re.compile(THERMAL_TRANSMITTENCE_STR)

# Matches any run of whitespace (spaces, tabs, newlines), not only doubled spaces.
DOUBLE_SPACE_PATTERN = re.compile(r"\s+")

# Translation tables are built once at import time instead of on every call.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)
_SPECIAL_CHARS_TABLE = str.maketrans({char: " " for char in ":;*@?!()"})


def extract_thermal_transmittance(result: dict, description: str) -> Tuple[
    Dict[str, Union[None, str, float]], str
]:
    """
    Extracts thermal transmittance from the description and updates the result dictionary.

    The value and unit are taken from the first match only, but every occurrence
    of the pattern is removed from the description.

    :param result: Dictionary to store the result in (updated in place).
    :param description: Lowercase description string.
    :return: A tuple containing the updated result dictionary and the description with the
        thermal transmittance part removed.
    """
    match = THERMAL_TRANSMITTANCE_REGEX.search(description)

    if match is None:
        result['thermal_transmittance'] = None
        result['thermal_transmittance_unit'] = None
        return result, description

    result['thermal_transmittance'] = float(match.group(1))
    result['thermal_transmittance_unit'] = match.group(2)
    # Remove the match from the description, reusing the compiled pattern.
    description = THERMAL_TRANSMITTANCE_REGEX.sub("", description)
    return result, description


def extract_component_types(result: dict, description: str, list_of_components: list) -> Tuple[
    Dict[str, Union[None, str, float, bool]], str
]:
    """
    Extracts component types from the description, updates the result dictionary, and removes the
    matched component types from the description.

    Matching is a plain substring test. Components are processed in list order and each
    one is removed before the next is tested, so an earlier component can consume text
    that a later one would otherwise have matched.

    :param result: Dictionary to store the results in (updated in place).
    :param description: Lowercase description string.
    :param list_of_components: List of component types to extract from the description.
    :return: A tuple containing the updated result dictionary and the description with the
        matched component types removed.
    """
    for component in list_of_components:
        result[f'is_{component.replace(" ", "_")}'] = component in description
        # Remove the component from the description
        description = description.replace(component, "")

    return result, description


def clean_description(description: str) -> str:
    """
    Clean the description by replacing any special characters with a space.

    The characters ``: ; * @ ? ! ( )`` each become a space, after which every run of
    whitespace is collapsed to a single space. Leading and trailing spaces are kept.

    :param description: Description string to clean.
    :return: The cleaned description.
    """
    return remove_double_spaces(description.translate(_SPECIAL_CHARS_TABLE))


def process_part(
    result: Dict[str, Union[str, bool]],
    part: str,
    attr_list: List[str],
    prefix: str,
) -> Dict[str, Union[str, bool]]:
    """
    Process a part of the description with a given list of epc_attributes and update the result
    dictionary.

    An attribute matches when every one of its whitespace-separated words appears as a
    word in ``part``, in any order. Each match sets
    ``result[prefix + attribute_with_underscores]`` to ``True``.

    :param result: Non-empty dictionary to update in place.
    :param part: The part of the description to search.
    :param attr_list: Attribute phrases to look for.
    :param prefix: Non-empty prefix for the keys written into ``result``.
    :return: The updated result dictionary.
    :raises TypeError: If any argument has the wrong type.
    :raises ValueError: If ``result`` or ``prefix`` is empty, or if no value in
        ``result`` is truthy after processing.
    """
    if not isinstance(result, dict):
        raise TypeError('Expected a dictionary for result')
    if not isinstance(part, str):
        raise TypeError('Expected a string for part')
    if not isinstance(attr_list, list) or not all(isinstance(i, str) for i in attr_list):
        raise TypeError('Expected a list of strings for attr_list')
    if not isinstance(prefix, str):
        raise TypeError('Expected a string for prefix')

    if not result:
        raise ValueError("Result dictionary cannot be empty")
    if not prefix:
        raise ValueError("Prefix cannot be empty")

    # Build the word set once; it does not change between attributes.
    part_words = set(part.split())

    for attr in attr_list:
        if set(attr.split()).issubset(part_words):
            result[f'{prefix}{attr.replace(" ", "_")}'] = True

    # NOTE(review): this inspects every value in result, so a truthy entry that was
    # already present before this call also suppresses the error - confirm intended.
    if not any(result.values()):
        raise ValueError("No attribute matches found")

    return result


def remove_punctuation(text: str) -> str:
    """
    Remove all ``string.punctuation`` characters from the text.

    Punctuation is deleted rather than replaced by a space, so ``"a-b"`` becomes
    ``"ab"``. Whitespace runs are then collapsed and the result is stripped.

    :param text: Text to process.
    :return: The text without punctuation, with normalised whitespace.
    """
    without_punctuation = text.translate(_PUNCTUATION_TABLE)
    return remove_double_spaces(without_punctuation).strip()


def remove_double_spaces(text: str) -> str:
    """
    Collapse every run of whitespace into a single space.

    A lone tab or newline is also turned into a space, because the pattern matches
    any whitespace run of length one or more.

    :param text: Text to process.
    :return: The text with whitespace runs collapsed.
    """
    return DOUBLE_SPACE_PATTERN.sub(" ", text)


def find_keyword(
    description: str,
    keywords: List[str],
    synonyms: Optional[Dict[str, str]] = None,
) -> Optional[str]:
    """
    Find the longest keyword that occurs in the description.

    The description is searched as given first; if nothing matches, it is searched
    again with punctuation removed.

    :param description: Text to search.
    :param keywords: Candidate keywords. The list is not modified.
    :param synonyms: Optional mapping from a keyword to the value to return in its place.
    :return: The synonym of the matched keyword if one is defined, otherwise the
        keyword itself, or None when no keyword is found.
    """
    if synonyms is None:
        synonyms = {}

    # Sort keywords by length, longest first.
    # This ensures that 'time and temperature zone control'
    # will be checked before 'temperature zone control' if both are present in the keywords list.
    # A sorted copy is used so the caller's list keeps its original order.
    ordered_keywords = sorted(keywords, key=len, reverse=True)

    for keyword in ordered_keywords:
        if keyword in description:
            return synonyms.get(keyword, keyword)

    # If no keyword is found, try again after removing punctuation
    description_without_punct = remove_punctuation(description)

    for keyword in ordered_keywords:
        if keyword in description_without_punct:
            return synonyms.get(keyword, keyword)

    return None