mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
199 lines
7.2 KiB
Python
199 lines
7.2 KiB
Python
import re
|
|
from typing import List, Dict, Any, Union, Tuple
|
|
from collections import Counter
|
|
|
|
|
|
class EpcClean:
    """
    Container for methods which we utilise for cleaning EPC data.

    The raw records are held in ``data``. Calling :meth:`clean` counts the
    distinct values of every field in ``CLEANING_FIELDS`` (stored in
    ``unique_vals``) and converts each distinct roof description into a
    dictionary of roof attributes (stored in ``cleaned``).
    """

    # Captures the first decimal number (digits, a dot, digits) in a description.
    U_VALUE_REGEX = re.compile(r"(\d+\.\d+)")
    # Matches the literal text "w/m-¦k" in a lower-cased description.
    # NOTE(review): presumably the mis-encoded form of the U-value unit as it
    # appears in the source data - confirm against the raw EPC export.
    UNIT_REGEX = re.compile(r"(w/m-¦k)")

    # Record keys whose distinct values are counted by ``clean``.
    CLEANING_FIELDS: List[str] = [
        "roof-description",
        "floor-description",
        "walls-description",
        "mainheat-description",
    ]

    def __init__(self, data: List[Dict[str, Any]]) -> None:
        """
        EpcClean constructor.

        :param data: List of dictionaries containing EPC data.
        """
        self.data: List[Dict[str, Any]] = data
        # field name -> Counter of the raw values seen for that field
        self.unique_vals: Dict[str, Any] = {}
        # field name -> list of {"original": ..., "cleaned": ...} entries
        self.cleaned: Dict[str, List[Any]] = {}

    def clean(self) -> None:
        """
        Cleans the EPC data, mapping text fields to property attributes.

        Every field in ``CLEANING_FIELDS`` gets its distinct values counted,
        but only "roof-description" values are currently converted into
        attributes; the other fields keep an empty list in ``cleaned``.

        :raises KeyError: If a record is missing one of ``CLEANING_FIELDS``.
        :raises NotImplementedError: If a roof description is not recognised
            by :meth:`clean_roof`.
        """
        self._init_empty_cleaned_obj()

        for field in self.CLEANING_FIELDS:
            self.unique_vals[field] = Counter(record[field] for record in self.data)

        # Each distinct description is cleaned once, however often it occurs.
        for description in self.unique_vals["roof-description"]:
            self.cleaned["roof-description"].append(
                {"original": description, "cleaned": self.clean_roof(description)}
            )

    def _init_empty_cleaned_obj(self) -> None:
        """
        Initializes an empty object for cleaned data.

        Resets ``cleaned`` to one empty list per field in ``CLEANING_FIELDS``.
        """
        self.cleaned = {field: [] for field in self.CLEANING_FIELDS}

    @staticmethod
    def _search_split_roof_description(desc: str) -> str:
        """
        Searches roof descriptions and looks for key words, determining a description about the roof's insulation.

        :param desc: Description to be searched.
        :return: "average" for "insulated", "below average" for "limited".
        :raises NotImplementedError: For any other key word.
        """
        if desc == "insulated":
            return "average"
        if desc == "limited":
            return "below average"
        raise NotImplementedError("Handle me")

    def _find_insulation_thickness(
        self, description_lower: str, is_pitched: bool, is_roof_room: bool, is_flat: bool
    ) -> Union[int, str, None]:
        """
        Finds insulation thickness in the description.

        The roof-type flags are checked in the order pitched, roof room, flat;
        the first one that is set decides how the description is parsed.

        :param description_lower: Lowercase description.
        :param is_pitched: Whether the roof is pitched.
        :param is_roof_room: Whether there is a room in the roof.
        :param is_flat: Whether the roof is flat.
        :return: 0 when the description says "no insulation"; an int thickness
            when a number precedes "mm"; the raw text when it contains "+";
            a qualitative rating ("average" / "below average"); else None.
        :raises NotImplementedError: If the key word after the roof type is
            not recognised.
        """
        if "no insulation" in description_lower:
            return 0

        if is_pitched:
            after_pitched = description_lower.split("pitched,")[-1]
            thickness = after_pitched.split("mm")[0].strip()
            if "+" in thickness:
                # Open-ended thickness text is returned as-is rather than parsed.
                return thickness
            try:
                return int(thickness)
            except ValueError:
                # No numeric thickness before "mm": fall back to key words.
                if "invalid input" in description_lower:
                    return None
                keyword = after_pitched.strip().split(" ")[0]
                return self._search_split_roof_description(keyword)

        if is_roof_room:
            desc_split_lookup = {
                "ceiling insulated": "average",
                "thatched": "average",
            }
            # Just search for specific phrases
            desc_split = description_lower.split("roof room(s),")[-1].strip()
            res = desc_split_lookup.get(desc_split)
            if res:
                return res

            # Otherwise classify on the first word after the roof type.
            return self._search_split_roof_description(desc_split.split(" ")[0])

        if is_flat:
            # Just search for specific phrases
            keyword = description_lower.split("flat,")[-1].lstrip().split(" ")[0]
            return self._search_split_roof_description(keyword)

        return None

    def _extract_thermal_transmittence(self, description_lower: str) -> Tuple[Union[float, None], Union[str, None]]:
        """
        Extracts thermal transmittance from the description.

        :param description_lower: Lowercase description.
        :return: Tuple containing U-value and unit; each element is None when
            the corresponding pattern is not found.
        """
        # Find U-value
        u_value_match = self.U_VALUE_REGEX.search(description_lower)
        u_value = float(u_value_match.group(1)) if u_value_match is not None else None

        # Find unit
        unit_match = self.UNIT_REGEX.search(description_lower)
        unit = unit_match.group(1) if unit_match is not None else None

        return u_value, unit

    def clean_roof(self, description: str) -> Dict[str, Union[str, bool, int, float, None]]:
        """
        We aim to extract features about the roof, so we can characterise it. We will check:
        - If the roof is pitched
        - If there is a room roof
        - if there is a loft
        - If it has insulation
        - if so, what degree of insulation

        Both return paths produce the same set of keys, so the results can be
        handled uniformly.

        :param description: Description of the roof.
        :return: Dictionary of attributes of the roof.
        :raises NotImplementedError: If the description mentions neither
            insulation, thermal transmittance nor thatch, or if its insulation
            key word is not recognised.
        """
        description_lower = description.lower().strip()

        if "another dwelling above" in description_lower or "other premises above" in description_lower:
            # No roof of its own to describe: report fixed defaults.
            return {
                "is_pitched": False,
                "is_roof_room": False,
                "has_loft": False,
                "insulation_thickness": 0,
                "has_dwelling_above": True,
                "assumed": "assumed" in description_lower,
                "is_flat": "flat" in description_lower,
                "is_thatched": False,
                "thermal_transmittence": None,
                "thermal_transmittence_unit": None,
            }

        is_pitched = "pitched" in description_lower
        is_roof_room = "roof room" in description_lower
        has_loft = "loft" in description_lower
        is_flat = "flat" in description_lower
        is_thatched = "thatched" in description_lower

        thermal_transmittence, thermal_transmittence_unit, insulation_thickness = None, None, None
        if "insulation" in description_lower or "insulated" in description_lower:
            insulation_thickness = self._find_insulation_thickness(description_lower, is_pitched, is_roof_room, is_flat)
        elif "thermal transmittance" in description_lower:
            thermal_transmittence, thermal_transmittence_unit = self._extract_thermal_transmittence(description_lower)
        elif is_thatched:
            # Search for these features:
            thermal_transmittence, thermal_transmittence_unit = self._extract_thermal_transmittence(description_lower)
            insulation_thickness = self._find_insulation_thickness(
                description_lower, is_pitched, is_roof_room, is_flat
            )
        else:
            raise NotImplementedError("Not handles this")

        return {
            "is_pitched": is_pitched,
            "is_roof_room": is_roof_room,
            "has_loft": has_loft,
            "insulation_thickness": insulation_thickness,
            "has_dwelling_above": False,
            "assumed": "assumed" in description_lower,
            "is_flat": is_flat,
            # Previously missing here, although the early return above has it.
            "is_thatched": is_thatched,
            "thermal_transmittence": thermal_transmittence,
            "thermal_transmittence_unit": thermal_transmittence_unit,
        }
|