Refactored code to add type annotations and stricter error handling

This commit is contained in:
Khalim Conn-Kowlessar 2023-06-08 13:40:26 +01:00
parent 6fc27d4797
commit 8e35201e76

View file

@ -1,5 +1,6 @@
import re
from collections import Counter, defaultdict
from typing import List, Dict, Any, Union, Tuple
from collections import Counter
class EpcClean:
@ -7,23 +8,29 @@ class EpcClean:
Container for methods which we utilise for cleaning EPC data
"""
CLEANING_FIELDS = [
U_VALUE_REGEX = re.compile(r"(\d+\.\d+)")
UNIT_REGEX = re.compile(r"(w/m-¦k)")
CLEANING_FIELDS: List[str] = [
"roof-description",
"floor-description",
"walls-description",
"mainheat-description"
]
def __init__(self, data):
self.data = data
self.unique_vals = {}
self.cleaned = {}
def clean(self):
def __init__(self, data: List[Dict[str, Any]]) -> None:
"""
This method cleans the EPC data, mapping text fields to property attributes
:return:
EpcClean constructor.
:param data: List of dictionaries containing EPC data.
"""
self.data: List[Dict[str, Any]] = data
self.unique_vals: Dict[str, Any] = {}
self.cleaned: Dict[str, List[Any]] = {}
def clean(self) -> None:
"""
Cleans the EPC data, mapping text fields to property attributes.
"""
self._init_empty_cleaned_obj()
@ -35,34 +42,55 @@ class EpcClean:
{"original": description, "cleaned": self.clean_roof(description)}
)
def _init_empty_cleaned_obj(self):
self.cleaned = defaultdict(list, {k: [] for k in self.CLEANING_FIELDS})
def _init_empty_cleaned_obj(self) -> None:
"""
Initializes an empty object for cleaned data.
"""
self.cleaned = {field: [] for field in self.CLEANING_FIELDS}
@staticmethod
def search_description_options(desc):
def _search_split_roof_description(desc: str) -> str:
"""
Matches the leading keyword of a roof description against known terms and returns the corresponding insulation rating.
:param desc: Description to be searched.
:return: Result of the search.
"""
if desc == "insulated":
return "average"
if desc == "limited":
return "below average"
raise Exception("Handle me")
raise NotImplementedError("Handle me")
def _find_insulation_thickness(self, description_lower, is_pitched, is_roof_room, is_flat):
def _find_insulation_thickness(
self, description_lower: str, is_pitched: bool, is_roof_room: bool, is_flat: bool
) -> Union[int, str, None]:
"""
Finds insulation thickness in the description.
:param description_lower: Lowercase description.
:param is_pitched: Whether the roof is pitched.
:param is_roof_room: Whether there is a room in the roof.
:param is_flat: Whether the roof is flat.
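:return: Thickness as an int (0 when the description says "no insulation"), a string when the thickness contains "+" or only a qualitative rating can be derived, or None if nothing is found.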
"""
if "no insulation" in description_lower:
return 0
if is_pitched:
try:
thickness = description_lower.split("pitched,")[-1].split("mm")[0].lstrip().rstrip()
thickness = description_lower.split("pitched,")[-1].split("mm")[0].strip()
if "+" in thickness:
return thickness
return int(thickness)
try:
return int(thickness)
except ValueError as int_error:
raise ValueError(int_error)
except ValueError as _:
if "invalid input" in description_lower:
return None
desc = description_lower.split("pitched,")[-1].lstrip().split(" ")[0]
return self._search_description_options(desc)
desc = description_lower.split("pitched,")[-1].strip().split(" ")[0]
return self._search_split_roof_description(desc)
if is_roof_room:
desc_split_lookup = {
@ -70,32 +98,37 @@ class EpcClean:
"thatched": "average",
}
# Just search for specific phrases
desc_split = description_lower.split("roof room(s),")[-1].lstrip()
desc_split = description_lower.split("roof room(s),")[-1].strip()
res = desc_split_lookup.get(desc_split)
if res:
return res
desc = desc_split.split(" ")[0]
return self._search_description_options(desc)
return self._search_split_roof_description(desc)
if is_flat:
# Just search for specific phrases
desc = description_lower.split("flat,")[-1].lstrip().split(" ")[0]
return self._search_description_options(desc)
return self._search_split_roof_description(desc)
return None
@staticmethod
def _extract_thermal_transmittence(description_lower):
def _extract_thermal_transmittence(self, description_lower: str) -> Tuple[Union[float, None], Union[str, None]]:
"""
Extracts thermal transmittance from the description.
:param description_lower: Lowercase description.
:return: Tuple containing U-value and unit.
"""
# Find U-value
u_value = re.search(r"(\d+\.\d+)", description_lower)
u_value = re.search(self.U_VALUE_REGEX, description_lower)
if u_value is not None:
u_value = float(u_value.group(1))
else:
u_value = None
# Find unit
unit = re.search(r"(w/m-¦k)", description_lower)
unit = re.search(self.UNIT_REGEX, description_lower)
if unit is not None:
unit = unit.group(1)
else:
@ -103,7 +136,7 @@ class EpcClean:
return u_value, unit
def clean_roof(self, description):
def clean_roof(self, description: str) -> Dict[str, Union[str, bool, int, None]]:
"""
We aim to extract features about the roof, so we can characterise it. We will check:
- If the roof is pitched
@ -111,12 +144,11 @@ class EpcClean:
- if there is a loft
- If it has insulation
- if so, what degree of insulation
-
:param x:
:return:
:param description: Description of the roof.
:return: Dictionary of attributes of the roof.
"""
description_lower = description.lower().lstrip().rstrip()
description_lower = description.lower().strip()
if "another dwelling above" in description_lower or "other premises above" in description_lower:
return {
@ -150,7 +182,7 @@ class EpcClean:
description_lower, is_pitched, is_roof_room, is_flat
)
else:
raise Exception("Implment me 2")
raise NotImplementedError("Not handles this")
attributes = {
"is_pitched": is_pitched,