# Model/etl/epc_clean/epc_attributes/WindowAttributes.py

from typing import Dict, Union
from BaseUtility import Definitions
from etl.epc_clean.epc_attributes.attribute_utils import clean_description, handle_mixed_translation


class WindowAttributes(Definitions):
    """Parse a free-text window description into structured glazing attributes.

    The description is lower-cased and cleaned on construction, translated from
    Welsh when it exactly matches a known phrase, and validated to contain at
    least one recognised glazing term. ``process`` then extracts whether the
    windows are glazed, how much of the property is covered, and the glazing
    type.
    """

    # Substrings that indicate the description is talking about glazing at all.
    GLAZING_KEYWORDS = ["glazing", "glazed", "glaze"]
    # Coverage terms, checked in this order; every entry must be a key of
    # ``coverage_map``. "thoughout" is kept as a tolerated misspelling of
    # "throughout" so that such descriptions no longer raise ``KeyError``.
    GLAZING_COVERAGE = ["fully", "mostly", "partial", "some", "full", "throughout", "thoughout"]
    # Glazing types, checked in this order; the first match wins.
    GLAZING_TYPES = ["double", "triple", "secondary", "multiple", "high performance", "single"]

    # Normalises each coverage term to one of "full", "most" or "partial".
    coverage_map = {
        "full": "full",
        "fully": "full",
        "mostly": "most",
        "partial": "partial",
        "some": "partial",
        "throughout": "full",
        "thoughout": "full",
    }

    # Exact (cleaned, lower-case) Welsh descriptions and their English equivalents.
    WELSH_TEXT = {
        "gwydrau dwbl llawn": "full double glazing",
        "gwydrau dwbl rhannol": "partial double glazing",
        "gwydrau dwbl gan mwyaf": "mostly double glazing",
        "rhai gwydrau dwbl": "some double glazing",
        "gwydrau sengl": "single glazed",
        "ffenestri perfformiad uchel": "high performance glazing",
        "gwydrau triphlyg llawn": "fully triple glazed",
        "gwydrau triphlyg rhannol": "partial triple glazed",
        "gwydrau triphlyg mwyaf": "mostly triple glazed",
        "gwydrau triphlyg gan mwyaf": "mostly triple glazed",
        "gwydrau eilaidd llawn": "full secondary glazing",
        "gwydrau eilaidd mwyaf": "mostly secondary glazing",
        "gwydrau eilaidd rhannol": "partial secondary glazing",
        "gwydrau lluosog ym mhobman": "multiple glazing throughout",
    }

    # These are observed data anomalies that we want to ignore.
    # They are compared against the raw (un-cleaned, case-sensitive) description.
    NO_DATA_CASES = [
        "SAP05:Windows",
        "Solid, no insulation (assumed)",  # A description typically associated with floors, not windows
        "Suspended, no insulation (assumed)",  # A description typically associated with floors, not windows
    ]

    def __init__(self, description: Union[str, None]):
        """Clean, translate and validate a window description.

        Args:
            description: Raw description text. ``None`` is treated like an
                empty string, i.e. as "no data".

        Raises:
            ValueError: If the description carries data but contains none of
                the recognised glazing keywords, coverage terms or types.
        """
        # Treat None as an empty description instead of failing on ``.lower()``.
        if description is None:
            description = ""

        self.description: str = clean_description(description.lower())

        # An empty description, or one matching a known anomaly, is flagged as
        # "no data"; ``process`` then returns a result with default values.
        # NOTE(review): DATA_ANOMALY_MATCHES is not defined in this class and is
        # presumably inherited from Definitions - confirm.
        self.nodata = (
            not description
            or description in self.DATA_ANOMALY_MATCHES
            or description in self.NO_DATA_CASES
        )

        # A recognised Welsh phrase is replaced by its English equivalent and
        # always counts as real data.
        translation = self.WELSH_TEXT.get(self.description)
        if translation:
            self.nodata = False
            self.description = translation

        # We handle seeming occurrences of mixed translations
        self.description = handle_mixed_translation(self.description)

        if not self.nodata:
            known_terms = self.GLAZING_KEYWORDS + self.GLAZING_COVERAGE + self.GLAZING_TYPES
            if not any(term in self.description for term in known_terms):
                raise ValueError('Invalid description')

    def process(self) -> Dict[str, Union[str, bool, None]]:
        """Extract glazing attributes from the cleaned description.

        Returns:
            A dictionary with the keys ``has_glazing`` (bool),
            ``glazing_coverage`` ("full", "most", "partial" or ``None``),
            ``glazing_type`` (an entry of ``GLAZING_TYPES`` or ``None``) and
            ``no_data`` (bool). When ``no_data`` is true the other values keep
            their defaults (``False`` / ``None``).
        """
        result: Dict[str, Union[str, bool, None]] = {
            "has_glazing": False,
            "glazing_coverage": None,
            "glazing_type": None,
            "no_data": self.nodata
        }

        if self.nodata:
            return result

        # We consolidate GLAZING_KEYWORDS into a single attribute
        result["has_glazing"] = any(keyword in self.description for keyword in self.GLAZING_KEYWORDS)

        # For coverage and type, we only store the first one we find, scanning
        # the comma-separated parts of the description from left to right.
        for part in self.description.split(','):
            part = part.strip()  # remove leading/trailing white spaces
            if not result["glazing_coverage"]:
                coverage = next((term for term in self.GLAZING_COVERAGE if term in part), None)
                if coverage is not None:
                    result["glazing_coverage"] = self.coverage_map[coverage]
            if not result["glazing_type"]:
                result["glazing_type"] = next(
                    (glazing_type for glazing_type in self.GLAZING_TYPES if glazing_type in part),
                    None,
                )

        is_single = result["glazing_type"] == "single"

        # If we didn't find any coverage, we assume full coverage - except for
        # single glazing, where the coverage is left unset.
        if not result["glazing_coverage"] and not is_single:
            result["glazing_coverage"] = "full"

        # Single glazing is not reported as glazing
        if is_single:
            result["has_glazing"] = False

        return result