From e927e6d41f4e266a04d19c319934edca930bf536 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 8 Jun 2023 16:46:52 +0100 Subject: [PATCH] Broken up the cleaning class --- epc_data/EpcClean.py | 233 ++---------------- epc_data/app.py | 4 +- epc_data/cleaning/Roof.py | 218 ++++++++++++++++ .../{test_EpcClean.py => test_clean_roof.py} | 17 +- 4 files changed, 248 insertions(+), 224 deletions(-) create mode 100644 epc_data/cleaning/Roof.py rename epc_data/tests/{test_EpcClean.py => test_clean_roof.py} (81%) diff --git a/epc_data/EpcClean.py b/epc_data/EpcClean.py index 08f024f2..a70d3d25 100644 --- a/epc_data/EpcClean.py +++ b/epc_data/EpcClean.py @@ -1,16 +1,14 @@ -import re -from typing import List, Dict, Any, Union, Tuple, Optional +from typing import List, Dict, Any from collections import Counter +from epc_data.cleaning.Roof import CleanRoof + class EpcClean: """ Container for methods which we utilise for cleaning EPC data """ - U_VALUE_REGEX = re.compile(r"(\d+\.\d+)") - UNIT_REGEX = re.compile(r"(w/m-¦k)") - CLEANING_FIELDS: List[str] = [ "roof-description", "floor-description", @@ -37,13 +35,15 @@ class EpcClean: for field in self.CLEANING_FIELDS: self.unique_vals[field] = Counter([v[field] for v in self.data]) - for description in self.unique_vals["roof-description"].keys(): - self.cleaned["roof-description"].append( - { - "original_description": description, - **self.clean_roof(description) - } - ) + self.clean_roof() + + # for description in self.unique_vals["floor-description"].keys(): + # self.cleaned["floor-description"].append( + # { + # "original_description": description, + # **self.clean_floor(description) + # } + # ) def _init_empty_cleaned_obj(self) -> None: """ @@ -51,206 +51,11 @@ class EpcClean: """ self.cleaned = {field: [] for field in self.CLEANING_FIELDS} - @staticmethod - def _search_split_roof_description(desc: str) -> str: - """ - Searches roof descriptions and looks for key words, determining a description about the roof's 
insulation. - - :param desc: Description to be searched. - :return: Result of the search. - """ - if desc == "insulated": - return "average" - if desc == "limited": - return "below average" - raise NotImplementedError("Handle me") - - def _find_insulation_thickness( - self, description_lower: str, is_pitched: bool, is_roof_room: bool, is_flat: bool - ) -> Union[int, str, None]: - """ - Finds insulation thickness in the description. - - :param description_lower: Lowercase description. - :param is_pitched: Whether the roof is pitched. - :param is_roof_room: Whether there is a room in the roof. - :param is_flat: Whether the roof is flat. - :return: Insulation thickness if found, else None. - """ - if "no insulation" in description_lower: - return 0 - - if is_pitched: - try: - thickness = description_lower.split("pitched,")[-1].split("mm")[0].strip() - if "+" in thickness: - return thickness - try: - return int(thickness) - except ValueError as int_error: - raise ValueError(int_error) - except ValueError as _: - if "invalid input" in description_lower: - return None - desc = description_lower.split("pitched,")[-1].strip().split(" ")[0] - return self._search_split_roof_description(desc) - - if is_roof_room: - desc_split_lookup = { - "ceiling insulated": "average", - "thatched": "average", - } - # Just search for specific phrases - desc_split = description_lower.split("roof room(s),")[-1].strip() - res = desc_split_lookup.get(desc_split) - if res: - return res - - desc = desc_split.split(" ")[0] - return self._search_split_roof_description(desc) - - if is_flat: - # Just search for specific phrases - desc = description_lower.split("flat,")[-1].lstrip().split(" ")[0] - return self._search_split_roof_description(desc) - - return None - - def _extract_thermal_transmittence(self, description_lower: str) -> Tuple[Union[float, None], Union[str, None]]: - """ - Extracts thermal transmittance from the description. - - :param description_lower: Lowercase description. 
- :return: Tuple containing U-value and unit. - """ - # Find U-value - u_value = re.search(self.U_VALUE_REGEX, description_lower) - if u_value is not None: - u_value = float(u_value.group(1)) - else: - u_value = None - - # Find unit - unit = re.search(self.UNIT_REGEX, description_lower) - if unit is not None: - unit = unit.group(1) - else: - unit = None - - return u_value, unit - - @staticmethod - def _make_clean_roof_output( - is_valid: bool, - at_rafters: bool, - is_pitched: bool, - is_roof_room: bool, - has_loft: bool, - insulation_thickness: str | int | None, - has_dwelling_above: bool, - assumed: bool, - is_flat: bool, - is_thatched: bool, - thermal_transmittence: Optional[float], - thermal_transmittence_unit: Optional[str] - ) -> Dict[str, Union[bool, str, None]]: - """ - Utility function to ensure all the keys are present in the output. - - :param is_valid: True if the roof descrption is valid, False otherwise - :param at_rafters: True if the insulation is at the rafters, False otherwise - :param is_pitched: True if the roof is pitched, False otherwise - :param is_roof_room: True if there is a room in the roof, False otherwise - :param has_loft: True if there is a loft, False otherwise - :param insulation_thickness: The thickness of the insulation - :param has_dwelling_above: True if there is a dwelling above, False otherwise - :param assumed: True if the roof type was assumed based on property age, False otherwise - :param is_flat: True if the roof is flat, False otherwise - :param is_thatched: True if the roof is thatched, False otherwise - :param thermal_transmittence: The thermal transmittence value of the roof, if known - :param thermal_transmittence_unit: The unit of thermal transmittence, if known - :return: A dictionary containing all the information about the roof. 
- """ - - return { - "is_valid": is_valid, - "at_rafters": at_rafters, - "is_pitched": is_pitched, - "is_roof_room": is_roof_room, - "has_loft": has_loft, - "insulation_thickness": insulation_thickness, - "has_dwelling_above": has_dwelling_above, - "assumed": assumed, - "is_flat": is_flat, - "is_thatched": is_thatched, - "thermal_transmittence": thermal_transmittence, - "thermal_transmittence_unit": thermal_transmittence_unit - } - - def clean_roof(self, description: str) -> Dict[str, Union[str, bool, int, None]]: - """ - We aim to extract features about the roof, so we can characterise it. We will check: - - If the roof is pitched - - If there is a room roof - - if there is a loft - - If it has insulation - - if so, what degree of insulation - - :param description: Description of the roof. - :return: Dictionary of attributes of the roof. - """ - description_lower = description.lower().strip() - - if "another dwelling above" in description_lower or "other premises above" in description_lower: - return self._make_clean_roof_output( - is_valid="invalid" not in description_lower, - at_rafters="at rafters" in description_lower, - is_pitched=False, - is_roof_room=False, - has_loft=False, - insulation_thickness=0, - has_dwelling_above=True, - assumed="assumed" in description_lower, - is_flat="flat" in description_lower, - is_thatched=False, - thermal_transmittence=None, - thermal_transmittence_unit=None + def clean_roof(self): + for description in self.unique_vals["roof-description"].keys(): + self.cleaned["roof-description"].append( + { + "original_description": description, + **CleanRoof(description).clean() + } ) - - is_pitched = "pitched" in description_lower - is_roof_room = "roof room" in description_lower - has_loft = "loft" in description_lower - is_flat = "flat" in description_lower - is_thatched = "thatched" in description_lower - at_rafters = "at rafters" in description_lower - - thermal_transmittence, thermal_transmittence_unit, insulation_thickness = None, 
None, None - if "insulation" in description_lower or "insulated" in description_lower: - insulation_thickness = self._find_insulation_thickness(description_lower, is_pitched, is_roof_room, is_flat) - elif "thermal transmittance" in description_lower: - thermal_transmittence, thermal_transmittence_unit = self._extract_thermal_transmittence(description_lower) - elif is_thatched: - # Search for these features: - thermal_transmittence, thermal_transmittence_unit = self._extract_thermal_transmittence(description_lower) - insulation_thickness = self._find_insulation_thickness( - description_lower, is_pitched, is_roof_room, is_flat - ) - elif description_lower == "pitched": - thermal_transmittence, thermal_transmittence_unit, insulation_thickness = None, None, None - else: - raise NotImplementedError("Not handled this") - - return self._make_clean_roof_output( - is_valid="invalid" not in description_lower, - at_rafters=at_rafters, - is_pitched=is_pitched, - is_roof_room=is_roof_room, - has_loft=has_loft, - insulation_thickness=insulation_thickness, - has_dwelling_above=False, - assumed="assumed" in description_lower, - is_flat=is_flat, - is_thatched=is_thatched, - thermal_transmittence=thermal_transmittence, - thermal_transmittence_unit=thermal_transmittence_unit - ) diff --git a/epc_data/app.py b/epc_data/app.py index 7788b7d2..6c3b48d1 100644 --- a/epc_data/app.py +++ b/epc_data/app.py @@ -38,5 +38,5 @@ def handler(): cleaner.clean() - import pandas as pd - df = pd.DataFrame(cleaner.cleaned["roof-description"]) + + diff --git a/epc_data/cleaning/Roof.py b/epc_data/cleaning/Roof.py new file mode 100644 index 00000000..7793d4e8 --- /dev/null +++ b/epc_data/cleaning/Roof.py @@ -0,0 +1,218 @@ +import re +from typing import Dict, Union, Tuple, Optional + + +class CleanRoof: + U_VALUE_REGEX = re.compile(r"(\d+\.\d+)") + UNIT_REGEX = re.compile(r"(w/m-¦k)") + + def __init__(self, description): + """ + :param description: Description of the roof. 
+ """ + self.description: str = description + + def clean(self) -> Dict[str, Union[str, bool, int, None]]: + """ + We aim to extract features about the roof, so we can characterise it. We will check: + - If the roof is pitched + - If there is a room roof + - if there is a loft + - If it has insulation + - if so, what degree of insulation + + :return: Dictionary of attributes of the roof. + """ + description_lower = self.description.lower().strip() + + if "another dwelling above" in description_lower or "other premises above" in description_lower: + return self._make_clean_output( + is_valid="invalid" not in description_lower, + at_rafters="at rafters" in description_lower, + is_pitched=False, + is_roof_room=False, + has_loft=False, + insulation_thickness=0, + has_dwelling_above=True, + assumed="assumed" in description_lower, + is_flat="flat" in description_lower, + is_thatched=False, + thermal_transmittence=None, + thermal_transmittence_unit=None + ) + + is_pitched = "pitched" in description_lower + is_roof_room = "roof room" in description_lower + has_loft = "loft" in description_lower + is_flat = "flat" in description_lower + is_thatched = "thatched" in description_lower + at_rafters = "at rafters" in description_lower + + thermal_transmittence, thermal_transmittence_unit, insulation_thickness = None, None, None + if "insulation" in description_lower or "insulated" in description_lower: + insulation_thickness = self._find_insulation_thickness(description_lower, is_pitched, is_roof_room, is_flat) + elif "thermal transmittance" in description_lower: + thermal_transmittence, thermal_transmittence_unit = self._extract_thermal_transmittence(description_lower) + elif is_thatched: + # Search for these features: + thermal_transmittence, thermal_transmittence_unit = self._extract_thermal_transmittence(description_lower) + insulation_thickness = self._find_insulation_thickness( + description_lower, is_pitched, is_roof_room, is_flat + ) + elif description_lower == 
"pitched": + thermal_transmittence, thermal_transmittence_unit, insulation_thickness = None, None, None + else: + raise NotImplementedError("Not handled this") + + return self._make_clean_output( + is_valid="invalid" not in description_lower, + at_rafters=at_rafters, + is_pitched=is_pitched, + is_roof_room=is_roof_room, + has_loft=has_loft, + insulation_thickness=insulation_thickness, + has_dwelling_above=False, + assumed="assumed" in description_lower, + is_flat=is_flat, + is_thatched=is_thatched, + thermal_transmittence=thermal_transmittence, + thermal_transmittence_unit=thermal_transmittence_unit + ) + + @staticmethod + def _make_clean_output( + is_valid: bool, + at_rafters: bool, + is_pitched: bool, + is_roof_room: bool, + has_loft: bool, + insulation_thickness: str | int | None, + has_dwelling_above: bool, + assumed: bool, + is_flat: bool, + is_thatched: bool, + thermal_transmittence: Optional[float], + thermal_transmittence_unit: Optional[str] + ) -> Dict[str, Union[bool, str, None]]: + """ + Utility function to ensure all the keys are present in the output. 
+ + :param is_valid: True if the roof description is valid, False otherwise + :param at_rafters: True if the insulation is at the rafters, False otherwise + :param is_pitched: True if the roof is pitched, False otherwise + :param is_roof_room: True if there is a room in the roof, False otherwise + :param has_loft: True if there is a loft, False otherwise + :param insulation_thickness: The thickness of the insulation + :param has_dwelling_above: True if there is a dwelling above, False otherwise + :param assumed: True if the roof type was assumed based on property age, False otherwise + :param is_flat: True if the roof is flat, False otherwise + :param is_thatched: True if the roof is thatched, False otherwise + :param thermal_transmittence: The thermal transmittance value of the roof, if known + :param thermal_transmittence_unit: The unit of thermal transmittance, if known + :return: A dictionary containing all the information about the roof. + """ + + return { + "is_valid": is_valid, + "at_rafters": at_rafters, + "is_pitched": is_pitched, + "is_roof_room": is_roof_room, + "has_loft": has_loft, + "insulation_thickness": insulation_thickness, + "has_dwelling_above": has_dwelling_above, + "assumed": assumed, + "is_flat": is_flat, + "is_thatched": is_thatched, + "thermal_transmittence": thermal_transmittence, + "thermal_transmittence_unit": thermal_transmittence_unit + } + + @staticmethod + def _search_split_description(desc: str) -> str: + """ + Searches roof descriptions and looks for key words, determining a description about the roof's insulation. + + :param desc: Description to be searched. + :return: Result of the search. 
+ """ + if desc == "insulated": + return "average" + if desc == "limited": + return "below average" + raise NotImplementedError("Handle me") + + @classmethod + def _find_insulation_thickness( + cls, description_lower: str, is_pitched: bool, is_roof_room: bool, is_flat: bool + ) -> Union[int, str, None]: + """ + Finds insulation thickness in the description. + + :param description_lower: Lowercase description. + :param is_pitched: Whether the roof is pitched. + :param is_roof_room: Whether there is a room in the roof. + :param is_flat: Whether the roof is flat. + :return: Insulation thickness if found, else None. + """ + if "no insulation" in description_lower: + return 0 + + if is_pitched: + try: + thickness = description_lower.split("pitched,")[-1].split("mm")[0].strip() + if "+" in thickness: + return thickness + try: + return int(thickness) + except ValueError as int_error: + raise ValueError(int_error) + except ValueError as _: + if "invalid input" in description_lower: + return None + desc = description_lower.split("pitched,")[-1].strip().split(" ")[0] + return cls._search_split_description(desc) + + if is_roof_room: + desc_split_lookup = { + "ceiling insulated": "average", + "thatched": "average", + } + # Just search for specific phrases + desc_split = description_lower.split("roof room(s),")[-1].strip() + res = desc_split_lookup.get(desc_split) + if res: + return res + + desc = desc_split.split(" ")[0] + return cls._search_split_description(desc) + + if is_flat: + # Just search for specific phrases + desc = description_lower.split("flat,")[-1].lstrip().split(" ")[0] + return cls._search_split_description(desc) + + return None + + @classmethod + def _extract_thermal_transmittence(cls, description_lower: str) -> Tuple[Union[float, None], Union[str, None]]: + """ + Extracts thermal transmittance from the description. + + :param description_lower: Lowercase description. + :return: Tuple containing U-value and unit. 
+ """ + # Find U-value + u_value = re.search(cls.U_VALUE_REGEX, description_lower) + if u_value is not None: + u_value = float(u_value.group(1)) + else: + u_value = None + + # Find unit + unit = re.search(cls.UNIT_REGEX, description_lower) + if unit is not None: + unit = unit.group(1) + else: + unit = None + + return u_value, unit diff --git a/epc_data/tests/test_EpcClean.py b/epc_data/tests/test_clean_roof.py similarity index 81% rename from epc_data/tests/test_EpcClean.py rename to epc_data/tests/test_clean_roof.py index 901d250e..a93c0564 100644 --- a/epc_data/tests/test_EpcClean.py +++ b/epc_data/tests/test_clean_roof.py @@ -3,6 +3,7 @@ import pickle from epc_data.EpcClean import EpcClean from pathlib import Path from epc_data.tests.test_data.EpcClean_test_roof_cases import clean_roof_test_cases +from epc_data.cleaning.Roof import CleanRoof # For local testing if __file__ == "": @@ -32,20 +33,20 @@ class TestEpcClean: assert all([len(values) == 0 for values in self.cleaner.cleaned.values()]) def test__search_split_roof_description(self): - assert self.cleaner._search_split_roof_description("insulated") == "average" - assert self.cleaner._search_split_roof_description("limited") == "below average" + assert CleanRoof._search_split_description("insulated") == "average" + assert CleanRoof._search_split_description("limited") == "below average" with pytest.raises(NotImplementedError): - self.cleaner._search_split_roof_description("unknown") + CleanRoof._search_split_description("unknown") def test__find_insulation_thickness(self): - assert self.cleaner._find_insulation_thickness("no insulation", False, False, False) == 0 + assert CleanRoof._find_insulation_thickness("no insulation", False, False, False) == 0 def test__extract_thermal_transmittence(self): description = "U-value of 2.3 w/m-¦k" - assert self.cleaner._extract_thermal_transmittence(description) == (2.3, "w/m-¦k") + assert CleanRoof._extract_thermal_transmittence(description) == (2.3, "w/m-¦k") def 
test_clean_roof(self): - result = self.cleaner.clean_roof('Pitched, 270 mm loft insulation') + result = CleanRoof('Pitched, 270 mm loft insulation').clean() # change the expected output based on your requirement expected_output = { @@ -66,7 +67,7 @@ class TestEpcClean: assert result == expected_output for test_case in clean_roof_test_cases: - result = self.cleaner.clean_roof(test_case['original_description']) + result = CleanRoof(test_case['original_description']).clean() # Ensure the output ordering is correct expected_result = {key: test_case[key] for key in result.keys()} expected_result["desc"] = test_case["original_description"] @@ -74,7 +75,7 @@ class TestEpcClean: assert result == expected_result def test_clean_roof_with_dwelling_above(self): - result = self.cleaner.clean_roof('(another dwelling above)') + result = CleanRoof('(another dwelling above)').clean() expected_output = { "is_valid": True,