diff --git a/epc_data/tests/EpcClean.py b/epc_data/tests/EpcClean.py index d2153320..cdd29c14 100644 --- a/epc_data/tests/EpcClean.py +++ b/epc_data/tests/EpcClean.py @@ -1,5 +1,6 @@ import re -from collections import Counter, defaultdict +from typing import List, Dict, Any, Union, Tuple +from collections import Counter class EpcClean: @@ -7,23 +8,29 @@ class EpcClean: Container for methods which we utilise for cleaning EPC data """ - CLEANING_FIELDS = [ + U_VALUE_REGEX = re.compile(r"(\d+\.\d+)") + UNIT_REGEX = re.compile(r"(w/m-¦k)") + + CLEANING_FIELDS: List[str] = [ "roof-description", "floor-description", "walls-description", "mainheat-description" ] - def __init__(self, data): - self.data = data - self.unique_vals = {} - - self.cleaned = {} - - def clean(self): + def __init__(self, data: List[Dict[str, Any]]) -> None: """ - This method cleans the EPC data, mapping text fields to propety attributes - :return: + EpcClean constructor. + + :param data: List of dictionaries containing EPC data. + """ + self.data: List[Dict[str, Any]] = data + self.unique_vals: Dict[str, Any] = {} + self.cleaned: Dict[str, List[Any]] = {} + + def clean(self) -> None: + """ + Cleans the EPC data, mapping text fields to property attributes. """ self._init_empty_cleaned_obj() @@ -35,34 +42,55 @@ class EpcClean: {"original": description, "cleaned": self.clean_roof(description)} ) - def _init_empty_cleaned_obj(self): - self.cleaned = defaultdict(list, {k: [] for k in self.CLEANING_FIELDS}) + def _init_empty_cleaned_obj(self) -> None: + """ + Initializes an empty object for cleaned data. + """ + self.cleaned = {field: [] for field in self.CLEANING_FIELDS} @staticmethod - def search_description_options(desc): + def _search_split_roof_description(desc: str) -> str: + """ + Searches roof descriptions and looks for key words, determining a description about the roof's insulation. + + :param desc: Description to be searched. + :return: Result of the search. + """ if desc == "insulated": return "average" if desc == "limited": return "below average" - raise Exception("Handle me") + raise NotImplementedError("Handle me") - def _find_insulation_thickness(self, description_lower, is_pitched, is_roof_room, is_flat): + def _find_insulation_thickness( + self, description_lower: str, is_pitched: bool, is_roof_room: bool, is_flat: bool + ) -> Union[int, str, None]: + """ + Finds insulation thickness in the description. + :param description_lower: Lowercase description. + :param is_pitched: Whether the roof is pitched. + :param is_roof_room: Whether there is a room in the roof. + :param is_flat: Whether the roof is flat. + :return: Insulation thickness if found, else None. + """ if "no insulation" in description_lower: return 0 if is_pitched: try: - - thickness = description_lower.split("pitched,")[-1].split("mm")[0].lstrip().rstrip() + thickness = description_lower.split("pitched,")[-1].split("mm")[0].strip() if "+" in thickness: return thickness - return int(thickness) + try: + return int(thickness) + except ValueError as int_error: + raise ValueError(int_error) except ValueError as _: if "invalid input" in description_lower: return None - desc = description_lower.split("pitched,")[-1].lstrip().split(" ")[0] - return self._search_description_options(desc) + desc = description_lower.split("pitched,")[-1].strip().split(" ")[0] + return self._search_split_roof_description(desc) if is_roof_room: desc_split_lookup = { @@ -70,32 +98,37 @@ class EpcClean: "thatched": "average", } # Just search for specific phrases - desc_split = description_lower.split("roof room(s),")[-1].lstrip() + desc_split = description_lower.split("roof room(s),")[-1].strip() res = desc_split_lookup.get(desc_split) if res: return res desc = desc_split.split(" ")[0] - return self._search_description_options(desc) + return self._search_split_roof_description(desc) if is_flat: # Just search for specific phrases desc = description_lower.split("flat,")[-1].lstrip().split(" ")[0] - return self._search_description_options(desc) + return self._search_split_roof_description(desc) return None - @staticmethod - def _extract_thermal_transmittence(description_lower): + def _extract_thermal_transmittence(self, description_lower: str) -> Tuple[Union[float, None], Union[str, None]]: + """ + Extracts thermal transmittance from the description. + + :param description_lower: Lowercase description. + :return: Tuple containing U-value and unit. + """ # Find U-value - u_value = re.search(r"(\d+\.\d+)", description_lower) + u_value = re.search(self.U_VALUE_REGEX, description_lower) if u_value is not None: u_value = float(u_value.group(1)) else: u_value = None # Find unit - unit = re.search(r"(w/m-¦k)", description_lower) + unit = re.search(self.UNIT_REGEX, description_lower) if unit is not None: unit = unit.group(1) else: @@ -103,7 +136,7 @@ class EpcClean: return u_value, unit - def clean_roof(self, description): + def clean_roof(self, description: str) -> Dict[str, Union[str, bool, int, None]]: """ We aim to extract features about the roof, so we can characterise it. We will check: - If the roof is pitched @@ -111,12 +144,11 @@ class EpcClean: - if there is a loft - If it has insulation - if so, what degree of insulation - - - :param x: - :return: + :param description: Description of the roof. + :return: Dictionary of attributes of the roof. """ - description_lower = description.lower().lstrip().rstrip() + description_lower = description.lower().strip() if "another dwelling above" in description_lower or "other premises above" in description_lower: return { @@ -150,7 +182,7 @@ class EpcClean: description_lower, is_pitched, is_roof_room, is_flat ) else: - raise Exception("Implment me 2") + raise NotImplementedError("Not handles this") attributes = { "is_pitched": is_pitched,