Model/etl/epc_clean/epc_attributes/WindowAttributes.py
2025-11-30 18:16:09 +00:00

102 lines
4.2 KiB
Python

from typing import Dict, Union
from BaseUtility import Definitions
from etl.epc_clean.epc_attributes.attribute_utils import clean_description, handle_mixed_translation
class WindowAttributes(Definitions):
    """Turn a free-text EPC window description into structured glazing attributes.

    The description is normalised in ``__init__`` (lower-cased, cleaned and,
    where an exact Welsh phrase is recognised, translated to English) and then
    broken down by ``process`` into a glazing flag, a coverage level and a
    glazing type.
    """

    # Substrings that indicate the description talks about glazing at all.
    GLAZING_KEYWORDS = ["glazing", "glazed", "glaze"]
    # Coverage words, checked in this order (first match wins). Every entry
    # must have a key in ``coverage_map``. The misspelling "thoughout" is kept
    # for backward compatibility with descriptions that were accepted before.
    GLAZING_COVERAGE = ["fully", "mostly", "partial", "some", "full", "throughout", "thoughout"]
    # Glazing types, checked in this order (first match wins).
    GLAZING_TYPES = ["double", "triple", "secondary", "multiple", "high performance", "single"]
    # Maps each coverage word to the normalised coverage value that is reported.
    coverage_map = {
        "full": "full",
        "fully": "full",
        "mostly": "most",
        "partial": "partial",
        "some": "partial",
        "throughout": "full",
        "thoughout": "full",
    }
    # Exact (already lower-cased/cleaned) Welsh descriptions and their English equivalents.
    WELSH_TEXT = {
        "gwydrau dwbl llawn": "full double glazing",
        "gwydrau dwbl rhannol": "partial double glazing",
        "gwydrau dwbl gan mwyaf": "mostly double glazing",
        "rhai gwydrau dwbl": "some double glazing",
        "gwydrau sengl": "single glazed",
        "ffenestri perfformiad uchel": "high performance glazing",
        "gwydrau triphlyg llawn": "fully triple glazed",
        "gwydrau triphlyg rhannol": "partial triple glazed",
        "gwydrau triphlyg mwyaf": "mostly triple glazed",
        "gwydrau triphlyg gan mwyaf": "mostly triple glazed",
        "gwydrau eilaidd llawn": "full secondary glazing",
        "gwydrau eilaidd mwyaf": "mostly secondary glazing",
        "gwydrau eilaidd rhannol": "partial secondary glazing",
        "gwydrau lluosog ym mhobman": "multiple glazing throughout",
    }
    # These are observed data anomalies that we want to ignore
    NO_DATA_CASES = [
        "SAP05:Windows",
        "Solid, no insulation (assumed)",  # A description typically associated with floors, not windows
        "Suspended, no insulation (assumed)",  # A description typically associated with floors, not windows
    ]

    def __init__(self, description: str):
        """Normalise ``description`` and decide whether it carries usable data.

        Args:
            description: Raw window description. ``None`` or an empty string
                is treated as "no data".

        Raises:
            ValueError: If the description is considered to hold data but
                contains none of the known glazing keywords, coverage words
                or glazing types.
        """
        # Guard against None so a missing description is reported as no-data
        # instead of failing on ``None.lower()``.
        raw_description = description or ""
        self.description: str = clean_description(raw_description.lower())

        # In the case of an empty description, we want to return a dictionary with all values set to False
        # and indicate there was no data.
        # NOTE: the anomaly lists are compared against the raw (un-normalised) text.
        # DATA_ANOMALY_MATCHES is not defined in this class; presumably inherited
        # from Definitions - TODO confirm.
        self.nodata = (
            not raw_description
            or raw_description in self.DATA_ANOMALY_MATCHES
            or raw_description in self.NO_DATA_CASES
        )

        # A recognised Welsh description is always treated as real data.
        translation = self.WELSH_TEXT.get(self.description)
        if translation:
            self.nodata = False
            self.description = translation

        # We handle seeming occurrences of mixed translations
        self.description = handle_mixed_translation(self.description)

        if not self.nodata and not any(
            term in self.description
            for term in self.GLAZING_KEYWORDS + self.GLAZING_COVERAGE + self.GLAZING_TYPES
        ):
            raise ValueError('Invalid description')

    def process(self) -> Dict[str, Union[str, bool, None]]:
        """Extract the glazing attributes from the normalised description.

        Returns:
            A dictionary with the keys ``has_glazing`` (bool),
            ``glazing_coverage`` (str or None), ``glazing_type`` (str or None)
            and ``no_data`` (bool). When ``no_data`` is true the other values
            keep their defaults (``False`` / ``None``).
        """
        result: Dict[str, Union[str, bool, None]] = {
            "has_glazing": False,
            "glazing_coverage": None,
            "glazing_type": None,
            "no_data": self.nodata
        }
        if self.nodata:
            return result

        # We consolidate GLAZING_KEYWORDS into a single attribute
        result["has_glazing"] = any(keyword in self.description for keyword in self.GLAZING_KEYWORDS)

        # For coverage and type, we will only store the first one we find
        for part in self.description.split(','):
            part = part.strip()  # remove leading/trailing white spaces
            if not result["glazing_coverage"]:
                result["glazing_coverage"] = next(
                    (self.coverage_map[word] for word in self.GLAZING_COVERAGE if word in part),
                    None,
                )
            if not result["glazing_type"]:
                result["glazing_type"] = next(
                    (glazing_type for glazing_type in self.GLAZING_TYPES if glazing_type in part),
                    None,
                )

        # If no coverage was found and the glazing is not single, we assume full coverage
        if not result["glazing_coverage"] and result["glazing_type"] != "single":
            result["glazing_coverage"] = "full"

        # Single glazing is not counted as glazing, so we reset the flag
        if result["glazing_type"] == "single":
            result["has_glazing"] = False
        return result