import re
from collections import Counter
from typing import Any, Dict, List, Optional, Tuple, Union


class EpcClean:
    """
    Container for methods which we utilise for cleaning EPC data.

    Typical use: construct with the raw records, call :meth:`clean`, then read
    ``unique_vals`` (per-field value counts) and ``cleaned`` (per-field list of
    ``{"original": ..., "cleaned": ...}`` entries).
    """

    # First decimal number (digits, dot, digits) in a description.
    U_VALUE_REGEX = re.compile(r"(\d+\.\d+)")
    # Unit string exactly as it is matched in the lowercased description.
    # NOTE(review): presumably a mis-encoded "W/m²K" in the source data - confirm.
    UNIT_REGEX = re.compile(r"(w/m-¦k)")

    # Record keys that are counted by ``clean``.
    CLEANING_FIELDS: List[str] = [
        "roof-description",
        "floor-description",
        "walls-description",
        "mainheat-description",
    ]

    def __init__(self, data: List[Dict[str, Any]]) -> None:
        """
        EpcClean constructor.

        :param data: List of dictionaries containing EPC data. Each dictionary
            is expected to contain every key listed in ``CLEANING_FIELDS``.
        """
        self.data: List[Dict[str, Any]] = data
        self.unique_vals: Dict[str, Any] = {}
        self.cleaned: Dict[str, List[Any]] = {}

    def clean(self) -> None:
        """
        Cleans the EPC data, mapping text fields to property attributes.

        Populates ``unique_vals`` with a ``Counter`` of the values seen for
        every field in ``CLEANING_FIELDS``. Only ``"roof-description"`` values
        are currently cleaned; the other fields keep an empty list in
        ``cleaned``.

        :raises KeyError: If a record is missing one of the cleaning fields.
        :raises NotImplementedError: If a roof description cannot be handled.
        """
        self._init_empty_cleaned_obj()

        for field in self.CLEANING_FIELDS:
            self.unique_vals[field] = Counter(record[field] for record in self.data)

        # Each distinct description is cleaned once, in first-seen order.
        self.cleaned["roof-description"].extend(
            {"original": description, "cleaned": self.clean_roof(description)}
            for description in self.unique_vals["roof-description"]
        )

    def _init_empty_cleaned_obj(self) -> None:
        """
        Initializes an empty object for cleaned data: one empty list per
        field in ``CLEANING_FIELDS``.
        """
        self.cleaned = {field: [] for field in self.CLEANING_FIELDS}

    @staticmethod
    def _search_split_roof_description(desc: str) -> str:
        """
        Searches roof descriptions and looks for key words, determining a
        description about the roof's insulation.

        :param desc: Keyword to be looked up (``"insulated"`` or ``"limited"``).
        :return: ``"average"`` for ``"insulated"``, ``"below average"`` for
            ``"limited"``.
        :raises NotImplementedError: For any other keyword.
        """
        if desc == "insulated":
            return "average"
        if desc == "limited":
            return "below average"
        raise NotImplementedError(
            f"Unhandled roof insulation keyword: {desc!r}"
        )

    def _find_insulation_thickness(
        self,
        description_lower: str,
        is_pitched: bool,
        is_roof_room: bool,
        is_flat: bool,
    ) -> Union[int, str, None]:
        """
        Finds insulation thickness in the description.

        The roof types are checked in the order pitched, roof room, flat; the
        first flag that is set decides how the description is parsed.

        :param description_lower: Lowercase description.
        :param is_pitched: Whether the roof is pitched.
        :param is_roof_room: Whether there is a room in the roof.
        :param is_flat: Whether the roof is flat.
        :return: ``0`` when the description says "no insulation"; an ``int``
            thickness when a number precedes "mm"; the raw string (e.g.
            ``"400+"``) when that number carries a ``+``; a qualitative label
            (``"average"`` / ``"below average"``) when only a keyword is
            present; ``None`` when nothing can be determined.
        :raises NotImplementedError: If the keyword following the roof type is
            not recognised.
        """
        if "no insulation" in description_lower:
            return 0

        if is_pitched:
            remainder = description_lower.split("pitched,")[-1]
            thickness = remainder.split("mm")[0].strip()
            if "+" in thickness:
                # Open-ended thickness such as "400+" is kept as text.
                return thickness
            try:
                return int(thickness)
            except ValueError:
                # Not a plain number: fall back to keyword matching below.
                pass
            if "invalid input" in description_lower:
                return None
            keyword = remainder.strip().split(" ")[0]
            return self._search_split_roof_description(keyword)

        if is_roof_room:
            phrase_lookup = {
                "ceiling insulated": "average",
                "thatched": "average",
            }
            # Try the whole trailing phrase first, then only its first word.
            phrase = description_lower.split("roof room(s),")[-1].strip()
            label = phrase_lookup.get(phrase)
            if label:
                return label
            return self._search_split_roof_description(phrase.split(" ")[0])

        if is_flat:
            # Only the first word after "flat," is inspected.
            keyword = description_lower.split("flat,")[-1].lstrip().split(" ")[0]
            return self._search_split_roof_description(keyword)

        return None

    def _extract_thermal_transmittence(
        self, description_lower: str
    ) -> Tuple[Optional[float], Optional[str]]:
        """
        Extracts thermal transmittance from the description.

        :param description_lower: Lowercase description.
        :return: Tuple containing U-value and unit; either element is ``None``
            when the corresponding pattern is not found.
        """
        value_match = self.U_VALUE_REGEX.search(description_lower)
        u_value = float(value_match.group(1)) if value_match is not None else None

        unit_match = self.UNIT_REGEX.search(description_lower)
        unit = unit_match.group(1) if unit_match is not None else None

        return u_value, unit

    def clean_roof(
        self, description: str
    ) -> Dict[str, Union[str, bool, int, float, None]]:
        """
        We aim to extract features about the roof, so we can characterise it.

        We will check:
        - If the roof is pitched
        - If there is a roof room
        - If there is a loft
        - If it is flat or thatched
        - If it has insulation and, if so, what degree of insulation
        - Its thermal transmittance, when stated

        :param description: Description of the roof.
        :return: Dictionary of attributes of the roof. The same set of keys is
            returned for every description.
        :raises NotImplementedError: If the description mentions neither
            insulation, thermal transmittance nor thatch, or if its insulation
            keyword is not recognised.
        """
        description_lower = description.lower().strip()
        assumed = "assumed" in description_lower
        is_flat = "flat" in description_lower

        if (
            "another dwelling above" in description_lower
            or "other premises above" in description_lower
        ):
            # Nothing else is parsed when there is a dwelling/premises above.
            return {
                "is_pitched": False,
                "is_roof_room": False,
                "has_loft": False,
                "insulation_thickness": 0,
                "has_dwelling_above": True,
                "assumed": assumed,
                "is_flat": is_flat,
                "is_thatched": False,
                "thermal_transmittence": None,
                "thermal_transmittence_unit": None,
            }

        is_pitched = "pitched" in description_lower
        is_roof_room = "roof room" in description_lower
        has_loft = "loft" in description_lower
        is_thatched = "thatched" in description_lower

        thermal_transmittence: Optional[float] = None
        thermal_transmittence_unit: Optional[str] = None
        insulation_thickness: Union[int, str, None] = None

        if "insulation" in description_lower or "insulated" in description_lower:
            insulation_thickness = self._find_insulation_thickness(
                description_lower, is_pitched, is_roof_room, is_flat
            )
        elif "thermal transmittance" in description_lower:
            thermal_transmittence, thermal_transmittence_unit = (
                self._extract_thermal_transmittence(description_lower)
            )
        elif is_thatched:
            # Thatched roofs without either phrase: search for both features.
            thermal_transmittence, thermal_transmittence_unit = (
                self._extract_thermal_transmittence(description_lower)
            )
            insulation_thickness = self._find_insulation_thickness(
                description_lower, is_pitched, is_roof_room, is_flat
            )
        else:
            raise NotImplementedError(
                f"Unhandled roof description: {description!r}"
            )

        return {
            "is_pitched": is_pitched,
            "is_roof_room": is_roof_room,
            "has_loft": has_loft,
            "insulation_thickness": insulation_thickness,
            "has_dwelling_above": False,
            "assumed": assumed,
            "is_flat": is_flat,
            # Included so both return paths expose the same keys.
            "is_thatched": is_thatched,
            "thermal_transmittence": thermal_transmittence,
            "thermal_transmittence_unit": thermal_transmittence_unit,
        }