Broken up the cleaning class

This commit is contained in:
Khalim Conn-Kowlessar 2023-06-08 16:46:52 +01:00
parent 872591505f
commit e927e6d41f
4 changed files with 248 additions and 224 deletions

View file

@ -1,16 +1,14 @@
import re
from typing import List, Dict, Any, Union, Tuple, Optional
from typing import List, Dict, Any
from collections import Counter
from epc_data.cleaning.Roof import CleanRoof
class EpcClean:
"""
Container for methods which we utilise for cleaning EPC data
"""
U_VALUE_REGEX = re.compile(r"(\d+\.\d+)")
UNIT_REGEX = re.compile(r"(w/m-¦k)")
CLEANING_FIELDS: List[str] = [
"roof-description",
"floor-description",
@ -37,13 +35,15 @@ class EpcClean:
for field in self.CLEANING_FIELDS:
self.unique_vals[field] = Counter([v[field] for v in self.data])
for description in self.unique_vals["roof-description"].keys():
self.cleaned["roof-description"].append(
{
"original_description": description,
**self.clean_roof(description)
}
)
self.clean_roof()
# for description in self.unique_vals["floor-description"].keys():
# self.cleaned["floor-description"].append(
# {
# "original_description": description,
# **self.clean_floor(description)
# }
# )
def _init_empty_cleaned_obj(self) -> None:
"""
@ -51,206 +51,11 @@ class EpcClean:
"""
self.cleaned = {field: [] for field in self.CLEANING_FIELDS}
@staticmethod
def _search_split_roof_description(desc: str) -> str:
"""
Searches roof descriptions and looks for key words, determining a description about the roof's insulation.
:param desc: Description to be searched.
:return: Result of the search.
"""
if desc == "insulated":
return "average"
if desc == "limited":
return "below average"
raise NotImplementedError("Handle me")
def _find_insulation_thickness(
self, description_lower: str, is_pitched: bool, is_roof_room: bool, is_flat: bool
) -> Union[int, str, None]:
"""
Finds insulation thickness in the description.
:param description_lower: Lowercase description.
:param is_pitched: Whether the roof is pitched.
:param is_roof_room: Whether there is a room in the roof.
:param is_flat: Whether the roof is flat.
:return: Insulation thickness if found, else None.
"""
if "no insulation" in description_lower:
return 0
if is_pitched:
try:
thickness = description_lower.split("pitched,")[-1].split("mm")[0].strip()
if "+" in thickness:
return thickness
try:
return int(thickness)
except ValueError as int_error:
raise ValueError(int_error)
except ValueError as _:
if "invalid input" in description_lower:
return None
desc = description_lower.split("pitched,")[-1].strip().split(" ")[0]
return self._search_split_roof_description(desc)
if is_roof_room:
desc_split_lookup = {
"ceiling insulated": "average",
"thatched": "average",
}
# Just search for specific phrases
desc_split = description_lower.split("roof room(s),")[-1].strip()
res = desc_split_lookup.get(desc_split)
if res:
return res
desc = desc_split.split(" ")[0]
return self._search_split_roof_description(desc)
if is_flat:
# Just search for specific phrases
desc = description_lower.split("flat,")[-1].lstrip().split(" ")[0]
return self._search_split_roof_description(desc)
return None
def _extract_thermal_transmittence(self, description_lower: str) -> Tuple[Union[float, None], Union[str, None]]:
"""
Extracts thermal transmittance from the description.
:param description_lower: Lowercase description.
:return: Tuple containing U-value and unit.
"""
# Find U-value
u_value = re.search(self.U_VALUE_REGEX, description_lower)
if u_value is not None:
u_value = float(u_value.group(1))
else:
u_value = None
# Find unit
unit = re.search(self.UNIT_REGEX, description_lower)
if unit is not None:
unit = unit.group(1)
else:
unit = None
return u_value, unit
@staticmethod
def _make_clean_roof_output(
is_valid: bool,
at_rafters: bool,
is_pitched: bool,
is_roof_room: bool,
has_loft: bool,
insulation_thickness: str | int | None,
has_dwelling_above: bool,
assumed: bool,
is_flat: bool,
is_thatched: bool,
thermal_transmittence: Optional[float],
thermal_transmittence_unit: Optional[str]
) -> Dict[str, Union[bool, str, None]]:
"""
Utility function to ensure all the keys are present in the output.
:param is_valid: True if the roof descrption is valid, False otherwise
:param at_rafters: True if the insulation is at the rafters, False otherwise
:param is_pitched: True if the roof is pitched, False otherwise
:param is_roof_room: True if there is a room in the roof, False otherwise
:param has_loft: True if there is a loft, False otherwise
:param insulation_thickness: The thickness of the insulation
:param has_dwelling_above: True if there is a dwelling above, False otherwise
:param assumed: True if the roof type was assumed based on property age, False otherwise
:param is_flat: True if the roof is flat, False otherwise
:param is_thatched: True if the roof is thatched, False otherwise
:param thermal_transmittence: The thermal transmittence value of the roof, if known
:param thermal_transmittence_unit: The unit of thermal transmittence, if known
:return: A dictionary containing all the information about the roof.
"""
return {
"is_valid": is_valid,
"at_rafters": at_rafters,
"is_pitched": is_pitched,
"is_roof_room": is_roof_room,
"has_loft": has_loft,
"insulation_thickness": insulation_thickness,
"has_dwelling_above": has_dwelling_above,
"assumed": assumed,
"is_flat": is_flat,
"is_thatched": is_thatched,
"thermal_transmittence": thermal_transmittence,
"thermal_transmittence_unit": thermal_transmittence_unit
}
def clean_roof(self, description: str) -> Dict[str, Union[str, bool, int, None]]:
"""
We aim to extract features about the roof, so we can characterise it. We will check:
- If the roof is pitched
- If there is a room roof
- if there is a loft
- If it has insulation
- if so, what degree of insulation
:param description: Description of the roof.
:return: Dictionary of attributes of the roof.
"""
description_lower = description.lower().strip()
if "another dwelling above" in description_lower or "other premises above" in description_lower:
return self._make_clean_roof_output(
is_valid="invalid" not in description_lower,
at_rafters="at rafters" in description_lower,
is_pitched=False,
is_roof_room=False,
has_loft=False,
insulation_thickness=0,
has_dwelling_above=True,
assumed="assumed" in description_lower,
is_flat="flat" in description_lower,
is_thatched=False,
thermal_transmittence=None,
thermal_transmittence_unit=None
def clean_roof(self):
for description in self.unique_vals["roof-description"].keys():
self.cleaned["roof-description"].append(
{
"original_description": description,
**CleanRoof(description).clean()
}
)
is_pitched = "pitched" in description_lower
is_roof_room = "roof room" in description_lower
has_loft = "loft" in description_lower
is_flat = "flat" in description_lower
is_thatched = "thatched" in description_lower
at_rafters = "at rafters" in description_lower
thermal_transmittence, thermal_transmittence_unit, insulation_thickness = None, None, None
if "insulation" in description_lower or "insulated" in description_lower:
insulation_thickness = self._find_insulation_thickness(description_lower, is_pitched, is_roof_room, is_flat)
elif "thermal transmittance" in description_lower:
thermal_transmittence, thermal_transmittence_unit = self._extract_thermal_transmittence(description_lower)
elif is_thatched:
# Search for these features:
thermal_transmittence, thermal_transmittence_unit = self._extract_thermal_transmittence(description_lower)
insulation_thickness = self._find_insulation_thickness(
description_lower, is_pitched, is_roof_room, is_flat
)
elif description_lower == "pitched":
thermal_transmittence, thermal_transmittence_unit, insulation_thickness = None, None, None
else:
raise NotImplementedError("Not handled this")
return self._make_clean_roof_output(
is_valid="invalid" not in description_lower,
at_rafters=at_rafters,
is_pitched=is_pitched,
is_roof_room=is_roof_room,
has_loft=has_loft,
insulation_thickness=insulation_thickness,
has_dwelling_above=False,
assumed="assumed" in description_lower,
is_flat=is_flat,
is_thatched=is_thatched,
thermal_transmittence=thermal_transmittence,
thermal_transmittence_unit=thermal_transmittence_unit
)

View file

@ -38,5 +38,5 @@ def handler():
cleaner.clean()
import pandas as pd
df = pd.DataFrame(cleaner.cleaned["roof-description"])

218
epc_data/cleaning/Roof.py Normal file
View file

@ -0,0 +1,218 @@
import re
from typing import Dict, Union, Tuple, Optional
class CleanRoof:
    """
    Parses a single EPC roof description into a flat dictionary of roof attributes.

    Usage: ``CleanRoof(description).clean()``. The helper methods are class/static
    methods, so they can also be called directly on the class.
    """

    # Matches the first decimal number (digits, a dot, digits) in a description.
    U_VALUE_REGEX = re.compile(r"(\d+\.\d+)")
    # NOTE(review): "w/m-¦k" looks like a mis-encoded "W/m²K"; presumably this is how the
    # unit appears in the raw (lower-cased) data - confirm before changing the pattern.
    UNIT_REGEX = re.compile(r"(w/m-¦k)")

    def __init__(self, description: str) -> None:
        """
        :param description: Description of the roof.
        """
        self.description: str = description

    def clean(self) -> Dict[str, Union[str, bool, int, float, None]]:
        """
        We aim to extract features about the roof, so we can characterise it. We will check:
        - If the roof is pitched
        - If there is a room roof
        - if there is a loft
        - If it has insulation
            - if so, what degree of insulation
        - If a thermal transmittance value (and unit) is quoted

        :return: Dictionary of attributes of the roof.
        :raises NotImplementedError: If the description does not match any handled pattern.
        """
        description_lower = self.description.lower().strip()
        is_valid = "invalid" not in description_lower
        assumed = "assumed" in description_lower
        at_rafters = "at rafters" in description_lower
        is_flat = "flat" in description_lower

        # A dwelling/premises above means there is no roof of our own to characterise:
        # the roof-shape flags are forced to False and the insulation thickness to 0.
        if "another dwelling above" in description_lower or "other premises above" in description_lower:
            return self._make_clean_output(
                is_valid=is_valid,
                at_rafters=at_rafters,
                is_pitched=False,
                is_roof_room=False,
                has_loft=False,
                insulation_thickness=0,
                has_dwelling_above=True,
                assumed=assumed,
                is_flat=is_flat,
                is_thatched=False,
                thermal_transmittence=None,
                thermal_transmittence_unit=None
            )

        is_pitched = "pitched" in description_lower
        is_roof_room = "roof room" in description_lower
        has_loft = "loft" in description_lower
        is_thatched = "thatched" in description_lower

        thermal_transmittence, thermal_transmittence_unit, insulation_thickness = None, None, None

        if "insulation" in description_lower or "insulated" in description_lower:
            insulation_thickness = self._find_insulation_thickness(
                description_lower, is_pitched, is_roof_room, is_flat
            )
        elif "thermal transmittance" in description_lower:
            thermal_transmittence, thermal_transmittence_unit = self._extract_thermal_transmittence(
                description_lower
            )
        elif is_thatched:
            # Thatched descriptions may carry either kind of information, so look for both.
            thermal_transmittence, thermal_transmittence_unit = self._extract_thermal_transmittence(
                description_lower
            )
            insulation_thickness = self._find_insulation_thickness(
                description_lower, is_pitched, is_roof_room, is_flat
            )
        elif description_lower != "pitched":
            # A bare "pitched" is valid and leaves everything as None; anything else is unknown.
            raise NotImplementedError("Not handled this")

        return self._make_clean_output(
            is_valid=is_valid,
            at_rafters=at_rafters,
            is_pitched=is_pitched,
            is_roof_room=is_roof_room,
            has_loft=has_loft,
            insulation_thickness=insulation_thickness,
            has_dwelling_above=False,
            assumed=assumed,
            is_flat=is_flat,
            is_thatched=is_thatched,
            thermal_transmittence=thermal_transmittence,
            thermal_transmittence_unit=thermal_transmittence_unit
        )

    @staticmethod
    def _make_clean_output(
        is_valid: bool,
        at_rafters: bool,
        is_pitched: bool,
        is_roof_room: bool,
        has_loft: bool,
        insulation_thickness: Union[str, int, None],
        has_dwelling_above: bool,
        assumed: bool,
        is_flat: bool,
        is_thatched: bool,
        thermal_transmittence: Optional[float],
        thermal_transmittence_unit: Optional[str]
    ) -> Dict[str, Union[bool, int, float, str, None]]:
        """
        Utility function to ensure all the keys are present in the output.

        :param is_valid: True if the roof description is valid, False otherwise
        :param at_rafters: True if the insulation is at the rafters, False otherwise
        :param is_pitched: True if the roof is pitched, False otherwise
        :param is_roof_room: True if there is a room in the roof, False otherwise
        :param has_loft: True if there is a loft, False otherwise
        :param insulation_thickness: The thickness of the insulation
        :param has_dwelling_above: True if there is a dwelling above, False otherwise
        :param assumed: True if the roof type was assumed based on property age, False otherwise
        :param is_flat: True if the roof is flat, False otherwise
        :param is_thatched: True if the roof is thatched, False otherwise
        :param thermal_transmittence: The thermal transmittence value of the roof, if known
        :param thermal_transmittence_unit: The unit of thermal transmittence, if known
        :return: A dictionary containing all the information about the roof.
        """
        return {
            "is_valid": is_valid,
            "at_rafters": at_rafters,
            "is_pitched": is_pitched,
            "is_roof_room": is_roof_room,
            "has_loft": has_loft,
            "insulation_thickness": insulation_thickness,
            "has_dwelling_above": has_dwelling_above,
            "assumed": assumed,
            "is_flat": is_flat,
            "is_thatched": is_thatched,
            "thermal_transmittence": thermal_transmittence,
            "thermal_transmittence_unit": thermal_transmittence_unit
        }

    @staticmethod
    def _search_split_description(desc: str) -> str:
        """
        Searches roof descriptions and looks for key words, determining a description about the roof's insulation.

        :param desc: Description to be searched.
        :return: Result of the search.
        :raises NotImplementedError: If the key word is not one we know how to grade.
        """
        if desc == "insulated":
            return "average"

        if desc == "limited":
            return "below average"

        raise NotImplementedError("Handle me")

    @classmethod
    def _find_insulation_thickness(
        cls, description_lower: str, is_pitched: bool, is_roof_room: bool, is_flat: bool
    ) -> Union[int, str, None]:
        """
        Finds insulation thickness in the description.

        :param description_lower: Lowercase description.
        :param is_pitched: Whether the roof is pitched.
        :param is_roof_room: Whether there is a room in the roof.
        :param is_flat: Whether the roof is flat.
        :return: Thickness in mm as an int, an open-ended thickness string such as "300+",
            a qualitative grade ("average" / "below average"), or None if nothing was found.
        :raises NotImplementedError: If a qualitative key word is present but not recognised.
        """
        if "no insulation" in description_lower:
            return 0

        if is_pitched:
            after_pitched = description_lower.split("pitched,")[-1]
            thickness = after_pitched.split("mm")[0].strip()
            if "+" in thickness:
                # Open-ended thickness (e.g. "300+") cannot be an int, so keep the text.
                return thickness
            try:
                return int(thickness)
            except ValueError:
                # Not a number: fall back to grading the first word after "pitched,".
                if "invalid input" in description_lower:
                    return None
                desc = after_pitched.strip().split(" ")[0]
                return cls._search_split_description(desc)

        if is_roof_room:
            desc_split_lookup = {
                "ceiling insulated": "average",
                "thatched": "average",
            }
            # Just search for specific phrases
            desc_split = description_lower.split("roof room(s),")[-1].strip()
            res = desc_split_lookup.get(desc_split)
            if res:
                return res
            desc = desc_split.split(" ")[0]
            return cls._search_split_description(desc)

        if is_flat:
            # Just search for specific phrases
            desc = description_lower.split("flat,")[-1].lstrip().split(" ")[0]
            return cls._search_split_description(desc)

        return None

    @classmethod
    def _extract_thermal_transmittence(cls, description_lower: str) -> Tuple[Union[float, None], Union[str, None]]:
        """
        Extracts thermal transmittance from the description.

        :param description_lower: Lowercase description.
        :return: Tuple containing U-value and unit; either element is None when not found.
        """
        # Find U-value
        u_value_match = cls.U_VALUE_REGEX.search(description_lower)
        u_value = float(u_value_match.group(1)) if u_value_match is not None else None

        # Find unit
        unit_match = cls.UNIT_REGEX.search(description_lower)
        unit = unit_match.group(1) if unit_match is not None else None

        return u_value, unit

View file

@ -3,6 +3,7 @@ import pickle
from epc_data.EpcClean import EpcClean
from pathlib import Path
from epc_data.tests.test_data.EpcClean_test_roof_cases import clean_roof_test_cases
from epc_data.cleaning.Roof import CleanRoof
# For local testing
if __file__ == "<input>":
@ -32,20 +33,20 @@ class TestEpcClean:
assert all([len(values) == 0 for values in self.cleaner.cleaned.values()])
def test__search_split_roof_description(self):
assert self.cleaner._search_split_roof_description("insulated") == "average"
assert self.cleaner._search_split_roof_description("limited") == "below average"
assert CleanRoof._search_split_description("insulated") == "average"
assert CleanRoof._search_split_description("limited") == "below average"
with pytest.raises(NotImplementedError):
self.cleaner._search_split_roof_description("unknown")
CleanRoof._search_split_description("unknown")
def test__find_insulation_thickness(self):
assert self.cleaner._find_insulation_thickness("no insulation", False, False, False) == 0
assert CleanRoof._find_insulation_thickness("no insulation", False, False, False) == 0
def test__extract_thermal_transmittence(self):
description = "U-value of 2.3 w/m-¦k"
assert self.cleaner._extract_thermal_transmittence(description) == (2.3, "w/m-¦k")
assert CleanRoof._extract_thermal_transmittence(description) == (2.3, "w/m-¦k")
def test_clean_roof(self):
result = self.cleaner.clean_roof('Pitched, 270 mm loft insulation')
result = CleanRoof('Pitched, 270 mm loft insulation').clean()
# change the expected output based on your requirement
expected_output = {
@ -66,7 +67,7 @@ class TestEpcClean:
assert result == expected_output
for test_case in clean_roof_test_cases:
result = self.cleaner.clean_roof(test_case['original_description'])
result = CleanRoof(test_case['original_description']).clean()
# Ensure the output ordering is correct
expected_result = {key: test_case[key] for key in result.keys()}
expected_result["desc"] = test_case["original_description"]
@ -74,7 +75,7 @@ class TestEpcClean:
assert result == expected_result
def test_clean_roof_with_dwelling_above(self):
result = self.cleaner.clean_roof('(another dwelling above)')
result = CleanRoof('(another dwelling above)').clean()
expected_output = {
"is_valid": True,