diff --git a/epc_data/app.py b/epc_data/app.py index b7ff54bd..9231fca8 100644 --- a/epc_data/app.py +++ b/epc_data/app.py @@ -34,145 +34,6 @@ def handler(): ) # TODO: Fill this - ClEANING_FIELDS = [ - "roof-description", - "floor-description", - "walls-description", - "mainheat-description" - ] field = "roof-description" - unique_vals = Counter([v[field] for v in data]) - def search_description_options(desc): - if desc == "insulated": - return "average" - if desc == "limited": - return "below average" - raise Exception("Handle me") - - def find_insulation_thickness(description_lower, is_pitched, is_roof_room, is_flat): - - if "no insulation" in description_lower: - return 0 - - if is_pitched: - try: - - thickness = description_lower.split("pitched,")[-1].split("mm")[0].lstrip().rstrip() - if "+" in thickness: - return thickness - return int(thickness) - except ValueError as _: - if "invalid input" in description_lower: - return None - desc = description_lower.split("pitched,")[-1].lstrip().split(" ")[0] - return search_description_options(desc) - - if is_roof_room: - desc_split_lookup = { - "ceiling insulated": "average", - "thatched": "average", - } - # Just search for specific phrases - desc_split = description_lower.split("roof room(s),")[-1].lstrip() - res = desc_split_lookup.get(desc_split) - if res: - return res - - desc = desc_split.split(" ")[0] - return search_description_options(desc) - - if is_flat: - # Just search for specific phrases - desc = description_lower.split("flat,")[-1].lstrip().split(" ")[0] - return search_description_options(desc) - - return None - - import re - def extract_thermal_transmittence(description_lower): - # Find U-value - u_value = re.search(r"(\d+\.\d+)", description_lower) - if u_value is not None: - u_value = float(u_value.group(1)) - else: - u_value = None - - # Find unit - unit = re.search(r"(w/m-¦k)", description_lower) - if unit is not None: - unit = unit.group(1) - else: - unit = None - - return u_value, unit - - def 
clean_roof(description): - """ - We aim to extract features about the roof, so we can characterise it. We will check: - - If the roof is pitched - - If there is a room roof - - if there is a loft - - If it has insulation - - if so, what degree of insulation - - - - :param x: - :return: - """ - description_lower = description.lower().lstrip().rstrip() - - if "another dwelling above" in description_lower or "other premises above" in description_lower: - return { - "is_pitched": False, - "is_roof_room": False, - "has_loft": False, - "insulation_thickness": 0, - "has_dwelling_above": True, - "assumed": "assumed" in description_lower, - "is_flat": "flat" in description_lower, - "is_thatched": False, - "thermal_transmittence": None, - "thermal_transmittence_unit": None, - } - - is_pitched = "pitched" in description_lower - is_roof_room = "roof room" in description_lower - has_loft = "loft" in description_lower - is_flat = "flat" in description_lower - is_thatched = "thatched" in description_lower - - thermal_transmittence, thermal_transmittence_unit, insulation_thickness = None, None, None - if "insulation" in description_lower or "insulated" in description_lower: - insulation_thickness = find_insulation_thickness(description_lower, is_pitched, is_roof_room, is_flat) - elif "thermal transmittance" in description_lower: - thermal_transmittence, thermal_transmittence_unit = extract_thermal_transmittence(description_lower) - elif is_thatched: - # Search for these features: - thermal_transmittence, thermal_transmittence_unit = extract_thermal_transmittence(description_lower) - insulation_thickness = find_insulation_thickness( - description_lower, is_pitched, is_roof_room, is_flat - ) - else: - raise Exception("Implment me 2") - - attributes = { - "is_pitched": is_pitched, - "is_roof_room": is_roof_room, - "has_loft": has_loft, - "insulation_thickness": insulation_thickness, - "has_dwelling_above": False, - "assumed": "assumed" in description_lower, - "is_flat": is_flat, - 
class EpcClean:
    """
    Container for methods which we utilise for cleaning EPC data.

    Free-text description fields are mapped to structured property
    attributes. Only ``roof-description`` is cleaned at the moment; the other
    entries of ``CLEANING_FIELDS`` are counted but not yet parsed.
    """

    CLEANING_FIELDS = [
        "roof-description",
        "floor-description",
        "walls-description",
        "mainheat-description",
    ]

    def __init__(self, data):
        """
        :param data: iterable of records; each record is indexed by every name
            in ``CLEANING_FIELDS`` when ``clean`` runs
        """
        self.data = data
        # field name -> Counter of the distinct raw descriptions seen
        self.unique_vals = {}
        # field name -> list of {"original": ..., "cleaned": ...} entries
        self.cleaned = {}

    def clean(self):
        """
        This method cleans the EPC data, mapping text fields to property
        attributes.

        Results are stored on ``self.unique_vals`` and ``self.cleaned``; any
        results from a previous call are discarded first.

        :return: None
        """
        self._init_empty_cleaned_obj()

        for field in self.CLEANING_FIELDS:
            self.unique_vals[field] = Counter(v[field] for v in self.data)

        # Each distinct description is parsed once, however often it occurs.
        for description in self.unique_vals["roof-description"]:
            self.cleaned["roof-description"].append(
                {"original": description, "cleaned": self.clean_roof(description)}
            )

    def _init_empty_cleaned_obj(self):
        """Reset ``self.cleaned`` to an empty list per cleaning field."""
        self.cleaned = defaultdict(list, {k: [] for k in self.CLEANING_FIELDS})

    @staticmethod
    def search_description_options(desc):
        """
        Map a qualitative insulation keyword to an insulation rating.

        :param desc: keyword taken from a lower-cased description
        :return: "average" for "insulated", "below average" for "limited"
        :raises Exception: for any other keyword
        """
        if desc == "insulated":
            return "average"
        if desc == "limited":
            return "below average"
        raise Exception("Handle me")

    def _find_insulation_thickness(self, description_lower, is_pitched, is_roof_room, is_flat):
        """
        Work out the insulation level from a lower-cased roof description.

        The roof-type flags are checked in the order pitched, roof room, flat;
        the first one that is set decides how the description is parsed.

        :param description_lower: lower-cased, stripped roof description
        :param is_pitched: whether the description mentions a pitched roof
        :param is_roof_room: whether the description mentions a roof room
        :param is_flat: whether the description mentions a flat roof
        :return: 0 when the text says "no insulation"; an int thickness taken
            from the text before "mm"; that text unchanged as a string when it
            contains "+"; a rating string from ``search_description_options``;
            or None when nothing applies
        :raises Exception: when the keyword found is not a known rating
        """
        if "no insulation" in description_lower:
            return 0

        if is_pitched:
            after_pitched = description_lower.split("pitched,")[-1]
            thickness = after_pitched.split("mm")[0].strip()
            if "+" in thickness:
                # Open-ended values cannot be an int, so the text is kept.
                return thickness
            try:
                return int(thickness)
            except ValueError:
                if "invalid input" in description_lower:
                    return None
                # No numeric thickness, so use the first word as a keyword.
                desc = after_pitched.lstrip().split(" ")[0]
                return self.search_description_options(desc)

        if is_roof_room:
            desc_split_lookup = {
                "ceiling insulated": "average",
                "thatched": "average",
            }
            # Just search for specific phrases
            desc_split = description_lower.split("roof room(s),")[-1].lstrip()
            res = desc_split_lookup.get(desc_split)
            if res:
                return res

            desc = desc_split.split(" ")[0]
            return self.search_description_options(desc)

        if is_flat:
            # Just search for specific phrases
            desc = description_lower.split("flat,")[-1].lstrip().split(" ")[0]
            return self.search_description_options(desc)

        return None

    @staticmethod
    def _extract_thermal_transmittence(description_lower):
        """
        Pull a U-value and its unit out of a lower-cased description.

        :param description_lower: lower-cased roof description
        :return: tuple ``(u_value, unit)``; ``u_value`` is the first decimal
            number in the text as a float, ``unit`` is the matched unit text,
            and either is None when not found
        """
        # Find U-value
        u_match = re.search(r"(\d+\.\d+)", description_lower)
        u_value = float(u_match.group(1)) if u_match is not None else None

        # Find unit
        # NOTE(review): the pattern looks like a mis-encoded "W/m²K" as it
        # appears in the raw data - confirm against the source export.
        unit_match = re.search(r"(w/m-¦k)", description_lower)
        unit = unit_match.group(1) if unit_match is not None else None

        return u_value, unit

    def clean_roof(self, description):
        """
        We aim to extract features about the roof, so we can characterise it.
        We will check:
        - If the roof is pitched
        - If there is a roof room
        - If there is a loft
        - If it is flat or thatched
        - If it has insulation, and if so, what degree of insulation
        - Its thermal transmittance, when that is stated instead

        :param description: raw roof description text
        :return: dict with the keys is_pitched, is_roof_room, has_loft,
            insulation_thickness, has_dwelling_above, assumed, is_flat,
            is_thatched, thermal_transmittence, thermal_transmittence_unit
        :raises Exception: when the description matches none of the handled
            cases
        """
        description_lower = description.lower().strip()

        if "another dwelling above" in description_lower or "other premises above" in description_lower:
            return {
                "is_pitched": False,
                "is_roof_room": False,
                "has_loft": False,
                "insulation_thickness": 0,
                "has_dwelling_above": True,
                "assumed": "assumed" in description_lower,
                "is_flat": "flat" in description_lower,
                "is_thatched": False,
                "thermal_transmittence": None,
                "thermal_transmittence_unit": None,
            }

        is_pitched = "pitched" in description_lower
        is_roof_room = "roof room" in description_lower
        has_loft = "loft" in description_lower
        is_flat = "flat" in description_lower
        is_thatched = "thatched" in description_lower

        thermal_transmittence, thermal_transmittence_unit, insulation_thickness = None, None, None
        if "insulation" in description_lower or "insulated" in description_lower:
            insulation_thickness = self._find_insulation_thickness(
                description_lower, is_pitched, is_roof_room, is_flat
            )
        elif "thermal transmittance" in description_lower:
            thermal_transmittence, thermal_transmittence_unit = self._extract_thermal_transmittence(
                description_lower
            )
        elif is_thatched:
            # Search for these features:
            thermal_transmittence, thermal_transmittence_unit = self._extract_thermal_transmittence(
                description_lower
            )
            insulation_thickness = self._find_insulation_thickness(
                description_lower, is_pitched, is_roof_room, is_flat
            )
        else:
            raise Exception("Implment me 2")

        # Same key set as the "dwelling above" result, so consumers can rely
        # on one schema.
        return {
            "is_pitched": is_pitched,
            "is_roof_room": is_roof_room,
            "has_loft": has_loft,
            "insulation_thickness": insulation_thickness,
            "has_dwelling_above": False,
            "assumed": "assumed" in description_lower,
            "is_flat": is_flat,
            "is_thatched": is_thatched,
            "thermal_transmittence": thermal_transmittence,
            "thermal_transmittence_unit": thermal_transmittence_unit,
        }