mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
Broken up the cleaning class
This commit is contained in:
parent
872591505f
commit
e927e6d41f
4 changed files with 248 additions and 224 deletions
|
|
@ -1,16 +1,14 @@
|
|||
import re
|
||||
from typing import List, Dict, Any, Union, Tuple, Optional
|
||||
from typing import List, Dict, Any
|
||||
from collections import Counter
|
||||
|
||||
from epc_data.cleaning.Roof import CleanRoof
|
||||
|
||||
|
||||
class EpcClean:
|
||||
"""
|
||||
Container for methods which we utilise for cleaning EPC data
|
||||
"""
|
||||
|
||||
U_VALUE_REGEX = re.compile(r"(\d+\.\d+)")
|
||||
UNIT_REGEX = re.compile(r"(w/m-¦k)")
|
||||
|
||||
CLEANING_FIELDS: List[str] = [
|
||||
"roof-description",
|
||||
"floor-description",
|
||||
|
|
@ -37,13 +35,15 @@ class EpcClean:
|
|||
for field in self.CLEANING_FIELDS:
|
||||
self.unique_vals[field] = Counter([v[field] for v in self.data])
|
||||
|
||||
for description in self.unique_vals["roof-description"].keys():
|
||||
self.cleaned["roof-description"].append(
|
||||
{
|
||||
"original_description": description,
|
||||
**self.clean_roof(description)
|
||||
}
|
||||
)
|
||||
self.clean_roof()
|
||||
|
||||
# for description in self.unique_vals["floor-description"].keys():
|
||||
# self.cleaned["floor-description"].append(
|
||||
# {
|
||||
# "original_description": description,
|
||||
# **self.clean_floor(description)
|
||||
# }
|
||||
# )
|
||||
|
||||
def _init_empty_cleaned_obj(self) -> None:
|
||||
"""
|
||||
|
|
@ -51,206 +51,11 @@ class EpcClean:
|
|||
"""
|
||||
self.cleaned = {field: [] for field in self.CLEANING_FIELDS}
|
||||
|
||||
@staticmethod
|
||||
def _search_split_roof_description(desc: str) -> str:
|
||||
"""
|
||||
Searches roof descriptions and looks for key words, determining a description about the roof's insulation.
|
||||
|
||||
:param desc: Description to be searched.
|
||||
:return: Result of the search.
|
||||
"""
|
||||
if desc == "insulated":
|
||||
return "average"
|
||||
if desc == "limited":
|
||||
return "below average"
|
||||
raise NotImplementedError("Handle me")
|
||||
|
||||
def _find_insulation_thickness(
|
||||
self, description_lower: str, is_pitched: bool, is_roof_room: bool, is_flat: bool
|
||||
) -> Union[int, str, None]:
|
||||
"""
|
||||
Finds insulation thickness in the description.
|
||||
|
||||
:param description_lower: Lowercase description.
|
||||
:param is_pitched: Whether the roof is pitched.
|
||||
:param is_roof_room: Whether there is a room in the roof.
|
||||
:param is_flat: Whether the roof is flat.
|
||||
:return: Insulation thickness if found, else None.
|
||||
"""
|
||||
if "no insulation" in description_lower:
|
||||
return 0
|
||||
|
||||
if is_pitched:
|
||||
try:
|
||||
thickness = description_lower.split("pitched,")[-1].split("mm")[0].strip()
|
||||
if "+" in thickness:
|
||||
return thickness
|
||||
try:
|
||||
return int(thickness)
|
||||
except ValueError as int_error:
|
||||
raise ValueError(int_error)
|
||||
except ValueError as _:
|
||||
if "invalid input" in description_lower:
|
||||
return None
|
||||
desc = description_lower.split("pitched,")[-1].strip().split(" ")[0]
|
||||
return self._search_split_roof_description(desc)
|
||||
|
||||
if is_roof_room:
|
||||
desc_split_lookup = {
|
||||
"ceiling insulated": "average",
|
||||
"thatched": "average",
|
||||
}
|
||||
# Just search for specific phrases
|
||||
desc_split = description_lower.split("roof room(s),")[-1].strip()
|
||||
res = desc_split_lookup.get(desc_split)
|
||||
if res:
|
||||
return res
|
||||
|
||||
desc = desc_split.split(" ")[0]
|
||||
return self._search_split_roof_description(desc)
|
||||
|
||||
if is_flat:
|
||||
# Just search for specific phrases
|
||||
desc = description_lower.split("flat,")[-1].lstrip().split(" ")[0]
|
||||
return self._search_split_roof_description(desc)
|
||||
|
||||
return None
|
||||
|
||||
def _extract_thermal_transmittence(self, description_lower: str) -> Tuple[Union[float, None], Union[str, None]]:
|
||||
"""
|
||||
Extracts thermal transmittance from the description.
|
||||
|
||||
:param description_lower: Lowercase description.
|
||||
:return: Tuple containing U-value and unit.
|
||||
"""
|
||||
# Find U-value
|
||||
u_value = re.search(self.U_VALUE_REGEX, description_lower)
|
||||
if u_value is not None:
|
||||
u_value = float(u_value.group(1))
|
||||
else:
|
||||
u_value = None
|
||||
|
||||
# Find unit
|
||||
unit = re.search(self.UNIT_REGEX, description_lower)
|
||||
if unit is not None:
|
||||
unit = unit.group(1)
|
||||
else:
|
||||
unit = None
|
||||
|
||||
return u_value, unit
|
||||
|
||||
@staticmethod
|
||||
def _make_clean_roof_output(
|
||||
is_valid: bool,
|
||||
at_rafters: bool,
|
||||
is_pitched: bool,
|
||||
is_roof_room: bool,
|
||||
has_loft: bool,
|
||||
insulation_thickness: str | int | None,
|
||||
has_dwelling_above: bool,
|
||||
assumed: bool,
|
||||
is_flat: bool,
|
||||
is_thatched: bool,
|
||||
thermal_transmittence: Optional[float],
|
||||
thermal_transmittence_unit: Optional[str]
|
||||
) -> Dict[str, Union[bool, str, None]]:
|
||||
"""
|
||||
Utility function to ensure all the keys are present in the output.
|
||||
|
||||
:param is_valid: True if the roof descrption is valid, False otherwise
|
||||
:param at_rafters: True if the insulation is at the rafters, False otherwise
|
||||
:param is_pitched: True if the roof is pitched, False otherwise
|
||||
:param is_roof_room: True if there is a room in the roof, False otherwise
|
||||
:param has_loft: True if there is a loft, False otherwise
|
||||
:param insulation_thickness: The thickness of the insulation
|
||||
:param has_dwelling_above: True if there is a dwelling above, False otherwise
|
||||
:param assumed: True if the roof type was assumed based on property age, False otherwise
|
||||
:param is_flat: True if the roof is flat, False otherwise
|
||||
:param is_thatched: True if the roof is thatched, False otherwise
|
||||
:param thermal_transmittence: The thermal transmittence value of the roof, if known
|
||||
:param thermal_transmittence_unit: The unit of thermal transmittence, if known
|
||||
:return: A dictionary containing all the information about the roof.
|
||||
"""
|
||||
|
||||
return {
|
||||
"is_valid": is_valid,
|
||||
"at_rafters": at_rafters,
|
||||
"is_pitched": is_pitched,
|
||||
"is_roof_room": is_roof_room,
|
||||
"has_loft": has_loft,
|
||||
"insulation_thickness": insulation_thickness,
|
||||
"has_dwelling_above": has_dwelling_above,
|
||||
"assumed": assumed,
|
||||
"is_flat": is_flat,
|
||||
"is_thatched": is_thatched,
|
||||
"thermal_transmittence": thermal_transmittence,
|
||||
"thermal_transmittence_unit": thermal_transmittence_unit
|
||||
}
|
||||
|
||||
def clean_roof(self, description: str) -> Dict[str, Union[str, bool, int, None]]:
|
||||
"""
|
||||
We aim to extract features about the roof, so we can characterise it. We will check:
|
||||
- If the roof is pitched
|
||||
- If there is a room roof
|
||||
- if there is a loft
|
||||
- If it has insulation
|
||||
- if so, what degree of insulation
|
||||
|
||||
:param description: Description of the roof.
|
||||
:return: Dictionary of attributes of the roof.
|
||||
"""
|
||||
description_lower = description.lower().strip()
|
||||
|
||||
if "another dwelling above" in description_lower or "other premises above" in description_lower:
|
||||
return self._make_clean_roof_output(
|
||||
is_valid="invalid" not in description_lower,
|
||||
at_rafters="at rafters" in description_lower,
|
||||
is_pitched=False,
|
||||
is_roof_room=False,
|
||||
has_loft=False,
|
||||
insulation_thickness=0,
|
||||
has_dwelling_above=True,
|
||||
assumed="assumed" in description_lower,
|
||||
is_flat="flat" in description_lower,
|
||||
is_thatched=False,
|
||||
thermal_transmittence=None,
|
||||
thermal_transmittence_unit=None
|
||||
def clean_roof(self):
|
||||
for description in self.unique_vals["roof-description"].keys():
|
||||
self.cleaned["roof-description"].append(
|
||||
{
|
||||
"original_description": description,
|
||||
**CleanRoof(description).clean()
|
||||
}
|
||||
)
|
||||
|
||||
is_pitched = "pitched" in description_lower
|
||||
is_roof_room = "roof room" in description_lower
|
||||
has_loft = "loft" in description_lower
|
||||
is_flat = "flat" in description_lower
|
||||
is_thatched = "thatched" in description_lower
|
||||
at_rafters = "at rafters" in description_lower
|
||||
|
||||
thermal_transmittence, thermal_transmittence_unit, insulation_thickness = None, None, None
|
||||
if "insulation" in description_lower or "insulated" in description_lower:
|
||||
insulation_thickness = self._find_insulation_thickness(description_lower, is_pitched, is_roof_room, is_flat)
|
||||
elif "thermal transmittance" in description_lower:
|
||||
thermal_transmittence, thermal_transmittence_unit = self._extract_thermal_transmittence(description_lower)
|
||||
elif is_thatched:
|
||||
# Search for these features:
|
||||
thermal_transmittence, thermal_transmittence_unit = self._extract_thermal_transmittence(description_lower)
|
||||
insulation_thickness = self._find_insulation_thickness(
|
||||
description_lower, is_pitched, is_roof_room, is_flat
|
||||
)
|
||||
elif description_lower == "pitched":
|
||||
thermal_transmittence, thermal_transmittence_unit, insulation_thickness = None, None, None
|
||||
else:
|
||||
raise NotImplementedError("Not handled this")
|
||||
|
||||
return self._make_clean_roof_output(
|
||||
is_valid="invalid" not in description_lower,
|
||||
at_rafters=at_rafters,
|
||||
is_pitched=is_pitched,
|
||||
is_roof_room=is_roof_room,
|
||||
has_loft=has_loft,
|
||||
insulation_thickness=insulation_thickness,
|
||||
has_dwelling_above=False,
|
||||
assumed="assumed" in description_lower,
|
||||
is_flat=is_flat,
|
||||
is_thatched=is_thatched,
|
||||
thermal_transmittence=thermal_transmittence,
|
||||
thermal_transmittence_unit=thermal_transmittence_unit
|
||||
)
|
||||
|
|
|
|||
|
|
@ -38,5 +38,5 @@ def handler():
|
|||
|
||||
cleaner.clean()
|
||||
|
||||
import pandas as pd
|
||||
df = pd.DataFrame(cleaner.cleaned["roof-description"])
|
||||
|
||||
|
||||
|
|
|
|||
218
epc_data/cleaning/Roof.py
Normal file
218
epc_data/cleaning/Roof.py
Normal file
|
|
@ -0,0 +1,218 @@
|
|||
import re
|
||||
from typing import Dict, Union, Tuple, Optional
|
||||
|
||||
|
||||
class CleanRoof:
    """
    Parses a single roof description string into a flat dictionary of roof attributes.

    Usage: ``CleanRoof(description).clean()``. All parsing is keyword based and operates on the
    lowercased, stripped description. Descriptions that match none of the known patterns raise
    ``NotImplementedError`` so that unhandled wording is surfaced rather than silently mis-parsed.
    """

    # First decimal number (digits, dot, digits) found in a description. Integers without a
    # decimal point deliberately do not match this pattern.
    U_VALUE_REGEX = re.compile(r"(\d+\.\d+)")
    # Literal unit text searched for in the lowercased description.
    # NOTE(review): presumably this is how "W/m²K" appears in the raw data after an encoding
    # mix-up - confirm against the source data before changing it.
    UNIT_REGEX = re.compile(r"(w/m-¦k)")

    def __init__(self, description: str) -> None:
        """
        :param description: Description of the roof.
        """
        self.description: str = description

    def clean(self) -> Dict[str, Union[str, bool, int, float, None]]:
        """
        We aim to extract features about the roof, so we can characterise it. We will check:
        - If the roof is pitched
        - If there is a room roof
        - if there is a loft
        - If it has insulation
            - if so, what degree of insulation

        :return: Dictionary of attributes of the roof.
        :raises NotImplementedError: If the description matches none of the handled patterns.
        """
        description_lower = self.description.lower().strip()

        # Flags shared by every output shape.
        is_valid = "invalid" not in description_lower
        assumed = "assumed" in description_lower
        at_rafters = "at rafters" in description_lower
        is_flat = "flat" in description_lower

        # A dwelling/premises above means there is no roof of our own to characterise, so the
        # roof-shape flags are forced to False and the insulation thickness to 0.
        if "another dwelling above" in description_lower or "other premises above" in description_lower:
            return self._make_clean_output(
                is_valid=is_valid,
                at_rafters=at_rafters,
                is_pitched=False,
                is_roof_room=False,
                has_loft=False,
                insulation_thickness=0,
                has_dwelling_above=True,
                assumed=assumed,
                is_flat=is_flat,
                is_thatched=False,
                thermal_transmittence=None,
                thermal_transmittence_unit=None
            )

        is_pitched = "pitched" in description_lower
        is_roof_room = "roof room" in description_lower
        has_loft = "loft" in description_lower
        is_thatched = "thatched" in description_lower

        thermal_transmittence, thermal_transmittence_unit, insulation_thickness = None, None, None
        if "insulation" in description_lower or "insulated" in description_lower:
            insulation_thickness = self._find_insulation_thickness(
                description_lower, is_pitched, is_roof_room, is_flat
            )
        elif "thermal transmittance" in description_lower:
            thermal_transmittence, thermal_transmittence_unit = self._extract_thermal_transmittence(
                description_lower
            )
        elif is_thatched:
            # Thatched descriptions may carry either kind of information, so look for both.
            thermal_transmittence, thermal_transmittence_unit = self._extract_thermal_transmittence(
                description_lower
            )
            insulation_thickness = self._find_insulation_thickness(
                description_lower, is_pitched, is_roof_room, is_flat
            )
        elif description_lower != "pitched":
            # A bare "pitched" is accepted with nothing further to extract; anything else is
            # wording we have not catered for yet. Include it so the gap can be diagnosed.
            raise NotImplementedError(f"Not handled this: {self.description!r}")

        return self._make_clean_output(
            is_valid=is_valid,
            at_rafters=at_rafters,
            is_pitched=is_pitched,
            is_roof_room=is_roof_room,
            has_loft=has_loft,
            insulation_thickness=insulation_thickness,
            has_dwelling_above=False,
            assumed=assumed,
            is_flat=is_flat,
            is_thatched=is_thatched,
            thermal_transmittence=thermal_transmittence,
            thermal_transmittence_unit=thermal_transmittence_unit
        )

    @staticmethod
    def _make_clean_output(
        is_valid: bool,
        at_rafters: bool,
        is_pitched: bool,
        is_roof_room: bool,
        has_loft: bool,
        insulation_thickness: Union[str, int, None],
        has_dwelling_above: bool,
        assumed: bool,
        is_flat: bool,
        is_thatched: bool,
        thermal_transmittence: Optional[float],
        thermal_transmittence_unit: Optional[str]
    ) -> Dict[str, Union[bool, str, int, float, None]]:
        """
        Utility function to ensure all the keys are present in the output.

        :param is_valid: True if the roof description is valid, False otherwise
        :param at_rafters: True if the insulation is at the rafters, False otherwise
        :param is_pitched: True if the roof is pitched, False otherwise
        :param is_roof_room: True if there is a room in the roof, False otherwise
        :param has_loft: True if there is a loft, False otherwise
        :param insulation_thickness: The thickness of the insulation: an int, a string such as
            "300+" or a qualitative label ("average", "below average"), or None if unknown
        :param has_dwelling_above: True if there is a dwelling above, False otherwise
        :param assumed: True if the description contains the word "assumed", False otherwise
        :param is_flat: True if the roof is flat, False otherwise
        :param is_thatched: True if the roof is thatched, False otherwise
        :param thermal_transmittence: The thermal transmittance value of the roof, if known
        :param thermal_transmittence_unit: The unit of thermal transmittance, if known
        :return: A dictionary containing all the information about the roof.
        """
        return {
            "is_valid": is_valid,
            "at_rafters": at_rafters,
            "is_pitched": is_pitched,
            "is_roof_room": is_roof_room,
            "has_loft": has_loft,
            "insulation_thickness": insulation_thickness,
            "has_dwelling_above": has_dwelling_above,
            "assumed": assumed,
            "is_flat": is_flat,
            "is_thatched": is_thatched,
            "thermal_transmittence": thermal_transmittence,
            "thermal_transmittence_unit": thermal_transmittence_unit
        }

    @staticmethod
    def _search_split_description(desc: str) -> str:
        """
        Maps a single insulation keyword to a qualitative insulation rating.

        :param desc: Keyword to be mapped (the first word following the roof type).
        :return: "average" for "insulated", "below average" for "limited".
        :raises NotImplementedError: For any other keyword.
        """
        if desc == "insulated":
            return "average"
        if desc == "limited":
            return "below average"
        raise NotImplementedError(f"Handle me: {desc!r}")

    @classmethod
    def _find_insulation_thickness(
        cls, description_lower: str, is_pitched: bool, is_roof_room: bool, is_flat: bool
    ) -> Union[int, str, None]:
        """
        Finds insulation thickness in the description.

        The roof types are tried in the order pitched, roof room, flat; the first one that is
        set decides how the description is parsed.

        :param description_lower: Lowercase description.
        :param is_pitched: Whether the roof is pitched.
        :param is_roof_room: Whether there is a room in the roof.
        :param is_flat: Whether the roof is flat.
        :return: 0 for "no insulation"; an int thickness; a string such as "300+" or a
            qualitative rating; None if nothing could be determined.
        :raises NotImplementedError: If the insulation keyword is not recognised.
        """
        if "no insulation" in description_lower:
            return 0

        if is_pitched:
            after_pitched = description_lower.split("pitched,")[-1]
            thickness = after_pitched.split("mm")[0].strip()
            if "+" in thickness:
                # Open-ended thickness such as "300+" cannot be an int; keep the text.
                return thickness
            try:
                return int(thickness)
            except ValueError:
                # Not a plain number - fall through to keyword matching below.
                pass
            if "invalid input" in description_lower:
                return None
            return cls._search_split_description(after_pitched.strip().split(" ")[0])

        if is_roof_room:
            # Whole-phrase matches take priority over the first-word keyword lookup.
            desc_split_lookup = {
                "ceiling insulated": "average",
                "thatched": "average",
            }
            desc_split = description_lower.split("roof room(s),")[-1].strip()
            res = desc_split_lookup.get(desc_split)
            if res is not None:
                return res
            return cls._search_split_description(desc_split.split(" ")[0])

        if is_flat:
            # Only the first word after "flat," is inspected.
            desc = description_lower.split("flat,")[-1].lstrip().split(" ")[0]
            return cls._search_split_description(desc)

        return None

    @classmethod
    def _extract_thermal_transmittence(cls, description_lower: str) -> Tuple[Union[float, None], Union[str, None]]:
        """
        Extracts thermal transmittance from the description.

        :param description_lower: Lowercase description.
        :return: Tuple containing U-value and unit; each element is None when not found.
        """
        u_value_match = cls.U_VALUE_REGEX.search(description_lower)
        u_value = float(u_value_match.group(1)) if u_value_match is not None else None

        unit_match = cls.UNIT_REGEX.search(description_lower)
        unit = unit_match.group(1) if unit_match is not None else None

        return u_value, unit
|
||||
|
|
@ -3,6 +3,7 @@ import pickle
|
|||
from epc_data.EpcClean import EpcClean
|
||||
from pathlib import Path
|
||||
from epc_data.tests.test_data.EpcClean_test_roof_cases import clean_roof_test_cases
|
||||
from epc_data.cleaning.Roof import CleanRoof
|
||||
|
||||
# For local testing
|
||||
if __file__ == "<input>":
|
||||
|
|
@ -32,20 +33,20 @@ class TestEpcClean:
|
|||
assert all([len(values) == 0 for values in self.cleaner.cleaned.values()])
|
||||
|
||||
def test__search_split_roof_description(self):
|
||||
assert self.cleaner._search_split_roof_description("insulated") == "average"
|
||||
assert self.cleaner._search_split_roof_description("limited") == "below average"
|
||||
assert CleanRoof._search_split_description("insulated") == "average"
|
||||
assert CleanRoof._search_split_description("limited") == "below average"
|
||||
with pytest.raises(NotImplementedError):
|
||||
self.cleaner._search_split_roof_description("unknown")
|
||||
CleanRoof._search_split_description("unknown")
|
||||
|
||||
def test__find_insulation_thickness(self):
|
||||
assert self.cleaner._find_insulation_thickness("no insulation", False, False, False) == 0
|
||||
assert CleanRoof._find_insulation_thickness("no insulation", False, False, False) == 0
|
||||
|
||||
def test__extract_thermal_transmittence(self):
|
||||
description = "U-value of 2.3 w/m-¦k"
|
||||
assert self.cleaner._extract_thermal_transmittence(description) == (2.3, "w/m-¦k")
|
||||
assert CleanRoof._extract_thermal_transmittence(description) == (2.3, "w/m-¦k")
|
||||
|
||||
def test_clean_roof(self):
|
||||
result = self.cleaner.clean_roof('Pitched, 270 mm loft insulation')
|
||||
result = CleanRoof('Pitched, 270 mm loft insulation').clean()
|
||||
|
||||
# change the expected output based on your requirement
|
||||
expected_output = {
|
||||
|
|
@ -66,7 +67,7 @@ class TestEpcClean:
|
|||
assert result == expected_output
|
||||
|
||||
for test_case in clean_roof_test_cases:
|
||||
result = self.cleaner.clean_roof(test_case['original_description'])
|
||||
result = CleanRoof(test_case['original_description']).clean()
|
||||
# Ensure the output ordering is correct
|
||||
expected_result = {key: test_case[key] for key in result.keys()}
|
||||
expected_result["desc"] = test_case["original_description"]
|
||||
|
|
@ -74,7 +75,7 @@ class TestEpcClean:
|
|||
assert result == expected_result
|
||||
|
||||
def test_clean_roof_with_dwelling_above(self):
|
||||
result = self.cleaner.clean_roof('(another dwelling above)')
|
||||
result = CleanRoof('(another dwelling above)').clean()
|
||||
|
||||
expected_output = {
|
||||
"is_valid": True,
|
||||
Loading…
Add table
Reference in a new issue