Model/etl/epc_clean/epc_attributes/RoofAttributes.py
Khalim Conn-Kowlessar 0e8136d445 debugging epc cleaning
2024-09-27 16:18:31 +01:00

169 lines
8 KiB
Python

import re
from typing import Dict, Union
from BaseUtility import Definitions
from etl.epc_clean.epc_attributes.attribute_utils import extract_component_types, extract_thermal_transmittance
class RoofAttributes(Definitions):
    """
    Parse a free-text EPC roof description into a flat dictionary of attributes.

    The description is lower-cased and stripped, translated from Welsh to English
    where a known phrasing is recognised, validated against the known roof
    vocabulary and finally turned into a result dictionary by :meth:`process`.

    NOTE(review): ``DATA_ANOMALY_MATCHES`` is read from ``self`` but not defined in
    this class; presumably it is inherited from ``Definitions`` - confirm.
    """

    # Substrings that identify a roof type inside a description.
    ROOF_TYPES = ['pitched', 'roof room', 'loft', 'flat', 'thatched', 'at rafters', 'assumed']

    # Phrases indicating that the space above is another dwelling / premises.
    DWELLING_ABOVE = ["another dwelling above", "other premises above", "other dwelling above"]

    # Exact-match Welsh -> English translations. Several keys contain garbled
    # characters (e.g. "wedigçöi", "wedi?i"); they are kept byte-for-byte because
    # they are matched against descriptions carrying the same encoding damage.
    WELSH_TEXT = {
        "ar oleddf, dim inswleiddio": "pitched, no insulation",
        "ar oleddf, dim inswleiddio (rhagdybiaeth)": "pitched, no insulation (assumed)",
        "ar oleddf, wedigçöi inswleiddio (rhagdybiaeth)": "pitched, insulated (assumed)",
        "ar oleddf, wedi?i inswleiddio (rhagdybiaeth)": "pitched, insulated (assumed)",
        "ar oleddf, wedigçöi hinswleiddio (rhagdybiaeth)": "pitched, insulated (assumed)",
        "ar oleddf, wedigçöi inswleiddio": "pitched, insulated",
        "ar oleddf, wedi?i inswleiddio": "pitched, insulated",
        "ar oleddf, inswleiddio cyfyngedig (rhagdybiaeth)": "pitched, limited insulation (assumed)",
        "ar oleddf, inswleiddio cyfyngedig": "pitched, limited insulation",
        "ar oleddf, wedigçöi inswleiddio wrth y trawstiau": 'pitched, insulated at rafters',
        "ar oleddf, wedi?i inswleiddio wrth y trawstiau": 'pitched, insulated at rafters',
        "ar oleddf, wedi?i inswleiddio wrth y trawstia": 'pitched, insulated at rafters',
        "ar oleddf, wedigçöi inswleiddio wrth y trawstia": 'pitched, insulated at rafters',
        "yn wastad, inswleiddio cyfyngedig (rhagdybiaeth)": "flat, limited insulation (assumed)",
        "yn wastad, inswleiddio cyfyngedig": "flat, limited insulation",
        "yn wastad, dim inswleiddio (rhagdybiaeth)": "flat, no insulation (assumed)",
        "yn wastad, dim inswleiddio": "flat, no insulation",
        "yn wastad, wedigçöi inswleiddio (rhagdybiaeth)": "flat, insulated (assumed)",
        "yn wastad, wedi?i hinswleiddio (rhagdybiaeth)": "flat, insulated (assumed)",
        "yn wastad, wedigçöi inswleiddio": "flat, insulated",
        "(eiddo arall uwchben)": "(another dwelling above)",
        "(annedd arall uwchben)": "(another dwelling above)",
        "ystafell(oedd) to, wedigçöi hinswleiddio": "roof room(s), insulated",
        "ystafell(oedd) to, wedi?i hinswleiddio (rhagdybiaeth)": "roof room(s), insulated (assumed)",
        "ystafell(oedd) to, wedigçöi hinswleiddio (rhagdybiaeth)": "roof room(s), insulated (assumed)",
        "ystafell(oedd) to, inswleiddio cyfyngedig (rhagdybiaeth)": "roof room(s), limited insulation (assumed)",
        "ystafell(oedd) to, inswleiddio cyfyngedig": "roof room(s), limited insulation",
        "ystafell(oedd) to, nenfwd wedigçöi inswleiddio": "roof room(s), ceiling insulated",
        "ystafell(oedd) to, dim inswleiddio (rhagdybiaeth)": "roof room(s), no insulation (assumed)",
        "ystafell(oedd) to, dim inswleiddio": "roof room(s), no insulation",
        "to gwellt, gydag inswleiddio ychwanegol": "thatched, with additional insulation",
    }

    # Keys returned (all set to False) by process() when there is no usable data.
    DEFAULT_KEYS = [
        'thermal_transmittance', 'thermal_transmittance_unit', 'is_pitched', 'is_roof_room',
        'is_loft', 'is_flat', 'is_thatched', 'is_at_rafters', 'is_assumed', 'has_dwelling_above',
        'is_valid', 'insulation_thickness'
    ]

    # Welsh loft-insulation phrasings where only the thickness varies. Group 1
    # captures the thickness including its "mm" suffix. Tried in order.
    _WELSH_LOFT_PATTERNS = tuple(
        re.compile(pattern) for pattern in (
            r"ar oleddf, (\d+ mm) o inswleiddio yn y llofft",
            r"ar oleddf, (\d+ mm) lo inswleiddio yn y llof",
            r"ar oleddf, (\d+\+ mm) lo inswleiddio yn y llof",
            r"ar oleddf, (\d+mm) o inswleiddio yn y llofft",
            r"ar oleddf, (\d+\+ mm) o inswleiddio yn y llofft",
        )
    )

    # Welsh "average thermal transmittance" phrasings. Group 1 captures the
    # U-value. The strict (garbled-unit) pattern is tried before the lenient one.
    _WELSH_UVALUE_PATTERNS = (
        re.compile(r"trawsyriannedd thermol cyfartalog (\d+(\.\d+)?)\s*w/m-¦k"),
        re.compile(r'trawsyriannedd thermol cyfartalog (\d+(\.\d+)?)\s*w/m.+k', re.IGNORECASE),
    )

    # Qualitative insulation labels, checked in this order; the first phrase
    # found in the description wins, so the order must not be changed casually.
    _INSULATION_LABELS = (
        ("ceiling insulated", "average"),
        ("insulated", "average"),
        ("limited", "below average"),
        ("no insulation", "none"),
        ("limited insulation", "below average"),
        ("additional insulation", "above average"),
    )

    def __init__(self, description: str):
        """
        Normalise the description, translate it if it is Welsh, and validate it.

        A description that is missing (``None`` or any non-string value), empty or
        whitespace-only, listed in ``DATA_ANOMALY_MATCHES``, or equal to
        ``"sap05:roof"`` is flagged as no-data instead of being validated.

        :param description: Description of the roof.
        :raises ValueError: If the description is not flagged as no-data and
            contains none of the known roof types, dwelling-above phrases or
            "average thermal transmittance".
        """
        # Guard before calling str methods: a missing value must not crash here.
        is_text = isinstance(description, str)
        self.description: str = description.lower().strip() if is_text else ""
        # The emptiness test short-circuits, so the anomaly lookup only ever
        # sees real strings. The lookup deliberately uses the raw description.
        self.nodata = (
            not self.description
            or description in self.DATA_ANOMALY_MATCHES
            or self.description == "sap05:roof"
        )

        self.welsh_translation_search()

        if self.nodata:
            return
        recognised = self.ROOF_TYPES + self.DWELLING_ABOVE + ["average thermal transmittance"]
        if not any(marker in self.description for marker in recognised):
            raise ValueError('Invalid description')

    def welsh_translation_search(self):
        """
        Translate a Welsh description to English in place, if it is recognised.

        Some Welsh descriptions share a fixed structure in which only a number
        changes (loft insulation thickness, average thermal transmittance).
        Rather than keeping a dictionary entry for every value, those are matched
        with regular expressions. Anything else falls back to an exact lookup in
        ``WELSH_TEXT``. Priority: loft insulation, then thermal transmittance,
        then the exact lookup. Unrecognised descriptions are left unchanged.
        """
        for pattern in self._WELSH_LOFT_PATTERNS:
            found = pattern.search(self.description)
            if found:
                self.description = f"pitched, {found.group(1)} loft insulation"
                return

        for pattern in self._WELSH_UVALUE_PATTERNS:
            found = pattern.search(self.description)
            if found:
                self.description = f"average thermal transmittance {found.group(1)} W/m-¦K"
                return

        translation = self.WELSH_TEXT.get(self.description)
        if translation:
            # Only the exact-lookup path clears the no-data flag.
            self.nodata = False
            self.description = translation

    def process(self) -> Dict[str, Union[float, str, bool, None]]:
        """
        Build the attribute dictionary for this roof description.

        :return: When the description was flagged as no-data, every key in
            ``DEFAULT_KEYS`` mapped to ``False``. Otherwise a dictionary holding
            whatever the two extraction helpers add, plus ``has_dwelling_above``,
            ``is_valid`` and ``insulation_thickness`` (a numeric string such as
            ``"150"`` or ``"300+"``, a qualitative label, or ``None``).
        """
        if self.nodata:
            return {key: False for key in self.DEFAULT_KEYS}

        result: Dict[str, Union[float, str, bool, None]] = {}
        description = self.description

        # Thermal transmittance. Presumably the helper strips what it consumed
        # from the description it returns - confirm against attribute_utils.
        result, description = extract_thermal_transmittance(result, description)

        # Roof type flags. The helper is expected to set at least "is_pitched"
        # and "is_loft", which are read below.
        result, description = extract_component_types(result, description, list_of_components=self.ROOF_TYPES)

        result["has_dwelling_above"] = any(phrase in description for phrase in self.DWELLING_ABOVE)
        for phrase in self.DWELLING_ABOVE:
            description = description.replace(phrase, "")

        result["is_valid"] = "invalid" not in description
        description = description.replace("invalid", "")

        # Edge case: "pitched, 150 loft insulation" where the "mm" is missing.
        if result["is_pitched"] or result["is_loft"]:
            bare_number = re.search(r"(\d+\+?)\s*insulation", description)
            if bare_number:
                result['insulation_thickness'] = bare_number.group(1)

        # Qualitative thickness: the first matching phrase wins and overrides the
        # bare number found above.
        for phrase, label in self._INSULATION_LABELS:
            if phrase in description:
                result['insulation_thickness'] = label
                break

        # An explicit thickness in mm takes precedence over everything else.
        explicit_mm = re.search(r'(\d+\+?)\s*mm', description)
        if explicit_mm:
            result['insulation_thickness'] = explicit_mm.group(1)

        if "insulation_thickness" not in result:
            result['insulation_thickness'] = None

        # With a dwelling above, the thermal transmittance is forced to zero.
        if result["has_dwelling_above"]:
            result["thermal_transmittance"] = 0
            result["thermal_transmittance_unit"] = 'w/m-¦k'

        return result