From 7c4d115f5f7e49a07d2a262eb422158aaf099a74 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 13 Jun 2023 10:26:10 +0100 Subject: [PATCH] refactored WallAttributes --- epc_data/app.py | 2 +- epc_data/attributes/WallAttributes.py | 213 +++++--------------------- 2 files changed, 41 insertions(+), 174 deletions(-) diff --git a/epc_data/app.py b/epc_data/app.py index ace7243f..bffb4092 100644 --- a/epc_data/app.py +++ b/epc_data/app.py @@ -44,7 +44,7 @@ def handler(): descriptions = {x["walls-description"] for x in data} out = [] for description in descriptions: - res = WallAttributes(description).clean() + res = WallAttributes(description).process_description() out.append( { "original_description": description, diff --git a/epc_data/attributes/WallAttributes.py b/epc_data/attributes/WallAttributes.py index 4b93f894..7f8de419 100644 --- a/epc_data/attributes/WallAttributes.py +++ b/epc_data/attributes/WallAttributes.py @@ -1,11 +1,8 @@ -from epc_data.attributes.attribute_utils import extract_thermal_transmittence -from typing import Optional -import nltk -from nltk.corpus import stopwords +import re +from typing import Dict, Union class WallAttributes: - IGNORE_STOP_WORDS = ["no"] def __init__(self, description: str): """ @@ -13,180 +10,50 @@ class WallAttributes: """ self.description: str = description - # TODO: Remove this out of here - nltk.download('stopwords', quiet=True) - self.stop_words = stopwords.words('english') - self.stop_words = [word for word in self.stop_words if word not in self.IGNORE_STOP_WORDS] + def process(self) -> Dict[str, Union[float, str, bool, None]]: + result: Dict[str, Union[float, str, bool, None]] = {} + description = self.description.lower() - def clean(self): - """ - - :return: - """ - - description_lower = self.description.lower().strip() - - thermal_transmittence: Optional[float] = None - thermal_transmittence_unit: Optional[str] = None - - is_cavity_wall: bool = "cavity wall" in description_lower - has_filled_cavity: 
bool = "filled cavity" in description_lower - is_solid_brick: bool = "solid brick" in description_lower - # TODO: Find out what this means - is_system_built - is_system_built: bool = "system built" in description_lower - is_timber_frame: bool = "timber frame" in description_lower - is_granite_or_whinstone: bool = "granite or whinstone" in description_lower - # The "as built" description indicates that these factors are based on the original construction - # specifications and materials. - as_built = "as built" in description_lower - assumed = "assumed" in description_lower - - insulation_thickness = None - if "thermal transmittance" in description_lower: - thermal_transmittence, thermal_transmittence_unit = extract_thermal_transmittence(description_lower) - - elif "insulation" in description_lower or "insulated" in description_lower: - insulation_thickness = self._characterise_insulation( - description_lower, - is_cavity_wall, - has_filled_cavity, - is_solid_brick, - as_built, - assumed, - is_system_built, - is_timber_frame, - is_granite_or_whinstone - ) - elif is_cavity_wall and has_filled_cavity: - # We can likely remove this branch of the if statement - thermal_transmittence, thermal_transmittence_unit, = None, None + # thermal transmittance - it can be negative which is errneous however we'll still pull it out + match = re.search(r"average thermal transmittance (-?\d+\.\d+)\s(w/m-¦k)", description) + if match: + result['thermal_transmittance'] = float(match.group(1)) + result['thermal_transmittance_unit'] = match.group(2) else: + result['thermal_transmittance'] = None + result['thermal_transmittance_unit'] = None - raise Exception("H") - - return self._make_output( - thermal_transmittence=thermal_transmittence, - thermal_transmittence_unit=thermal_transmittence_unit, - is_solid_brick=is_solid_brick, - insulation_thickness=insulation_thickness - ) - - def _characterise_insulation( - self, - description_lower, - is_cavity_wall, - has_filled_cavity, - 
is_solid_brick, - as_built, - assumed, - is_system_built, - is_timber_frame, - is_granite_or_whinstone - ): - - search_description = description_lower.replace("(assumed)", "").strip() if assumed else description_lower - search_description = search_description.replace("as built,", "").strip() if as_built else search_description - search_description = search_description.replace("system built,", "").strip() if \ - is_system_built else search_description - search_description = search_description.replace("timber frame,", "").strip() if \ - is_timber_frame else search_description - search_description = search_description.replace("granite or whinstone,", "").strip() if \ - is_granite_or_whinstone else search_description - search_description = search_description.replace("cavity wall,", "").strip() if \ - is_cavity_wall else search_description - search_description = search_description.replace("filled cavity", "").strip() if \ - has_filled_cavity else search_description - - characterisation_map = { - "external": "external", - "internal": "internal" - } + # wall type + result['is_cavity_wall'] = 'cavity wall' in description + result['has_filled_cavity'] = 'filled cavity' in description + result['is_solid_brick'] = 'solid brick' in description + result['is_system_built'] = 'system built' in description + result['is_timber_frame'] = 'timber frame' in description + result['is_granite_or_whinstone'] = 'granite' in description or 'whinstone' in description + result['as_built'] = 'as built' in description + result['is_cob'] = 'cob' in description + result['assumed'] = 'assumed' in description + result['is_sandstone_or_limestone'] = 'sandstone or limestone' in description + # insulation thickness - this is far from a perfect approach and we'd likely need to use nlp to do this + # generally however this is sufficient for mvp thickness_map = { - "external": "average", - "internal": "average", - "partial": "below average", - "no": "none", - # TODO: CHECK IF ADDITIONAL = ABOVE AVERAGE - 
"additional": "above average" + "external insulation": "average", + "internal insulation": "average", + "partial insulation": "below average", + "no insulation": "none", + "additional insulation": "above average", + "insulated": "average" } - - insulation_term = "insulation" if "insulation" in search_description else "insulated" - - search_description = search_description.replace(insulation_term, "").strip() - - # TODO: We might not need all these if statements.. - if is_cavity_wall: - - if search_description == "": - insulation_thickness = "average" - insulation_characteristic = None - else: - insulation_characteristic = characterisation_map.get( - search_description.split(" ")[-1] - ) - - insulation_thickness = [k for k in thickness_map if k in search_description] - if not insulation_thickness or len(insulation_thickness) > 1: - raise Exception("Check me out") - - insulation_thickness = thickness_map.get(insulation_thickness[0]) - - if not insulation_thickness: - raise NotImplementedError("Implement me! 
- insulation_thickness") - elif is_solid_brick: - desc_split = search_description.split("solid brick,")[-1].strip().split("as built,")[-1] - - if desc_split == "": - insulation_thickness = "average" - insulation_characteristic = None - else: - insulation_thickness = thickness_map.get(desc_split.split(insulation_term)[0].strip()) - if not insulation_thickness: - insulation_thickness = "average" - - insulation_characteristic = characterisation_map.get(search_description.split(" ")[-1]) - - elif is_system_built: - if search_description == "": - insulation_thickness = "average" - insulation_characteristic = None - else: - insulation_characteristic = characterisation_map.get(search_description.split(" ")[-1]) - insulation_thickness = [k for k in thickness_map if k in search_description] - if not insulation_thickness or len(insulation_thickness) > 1: - raise Exception("Check me out") - - insulation_thickness = thickness_map.get(insulation_thickness[0]) - elif is_timber_frame: - if search_description == "": - insulation_thickness = "average" - insulation_characteristic = None - else: - insulation_characteristic = characterisation_map.get(search_description.split(" ")[-1]) - - insulation_thickness = [k for k in thickness_map if k in search_description] - if not insulation_thickness or len(insulation_thickness) > 1: - raise Exception("Check me out") - - insulation_thickness = thickness_map.get(insulation_thickness[0]) - + for key, value in thickness_map.items(): + if key in description: + result['insulation_thickness'] = value + break else: - insulation_characteristic = characterisation_map.get(search_description.split(" ")[-1]) + result['insulation_thickness'] = None - insulation_thickness = [k for k in thickness_map if k in search_description] - if not insulation_thickness or len(insulation_thickness) > 1: - raise Exception("Check me out") + # insulation type + result['external_insulation'] = 'external insulation' in description + result['internal_insulation'] = 'internal 
insulation' in description - insulation_thickness = thickness_map.get(insulation_thickness[0]) - - return insulation_thickness, insulation_characteristic - - @staticmethod - def _make_output(thermal_transmittence, thermal_transmittence_unit, is_solid_brick, insulation_thickness): - return { - "thermal_transmittence": thermal_transmittence, - "thermal_transmittence_unit": thermal_transmittence_unit, - "is_solid_brick": is_solid_brick, - "insulation_thickness": insulation_thickness - } + return result