# Model/model_data/EpcClean.py

from typing import List, Dict, Any
from collections import Counter
from collections import defaultdict

from model_data.utils import correct_spelling
from model_data.epc_attributes.FloorAttributes import FloorAttributes
from model_data.epc_attributes.HotWaterAttributes import HotWaterAttributes
from model_data.epc_attributes.MainFuelAttributes import MainFuelAttributes
from model_data.epc_attributes.MainheatAttributes import MainHeatAttributes
from model_data.epc_attributes.MainheatControlAttributes import MainheatControlAttributes
from model_data.epc_attributes.RoofAttributes import RoofAttributes
from model_data.epc_attributes.WallAttributes import WallAttributes
from model_data.epc_attributes.WindowAttributes import WindowAttributes
from model_data.epc_attributes.LightingAttributes import LightingAttributes


class EpcClean:
    """
    Container for methods which we utilise for epc_attributes EPC data
    """

    CLEANING_FIELDS: List[str] = [
        "floor-description",
        "hotwater-description",
        "main-fuel",
        "mainheat-description",
        "mainheatcont-description",
        "roof-description",
        "walls-description",
        "windows-description",
        "lighting-description"
    ]

    def __init__(self, data: List[Dict[str, Any]],
                 lighting_averages: List[Dict[str, str | float]] | None = None) -> None:
        """
        EpcClean constructor.

        :param data: List of dictionaries containing EPC data.
        """
        self.data: List[Dict[str, Any]] = data
        self.unique_vals: Dict[str, Any] = {}
        self.cleaned: Dict[str, List[Any]] = {}

        if not lighting_averages:
            self.lighting_averages = self._calculate_lighting_averages()
        else:
            self.lighting_averages = lighting_averages

    def _calculate_lighting_averages(self):

        """
        This is a simple utility function that for few textual lighting descriptions, will calculate the average
        low energy lighting proportion. This is only valid for a very tiny number of cases and so a very simple
        methodology is applied

        This is done without pandas so we can utilise this inside of our lambdas

        :return: list of avergages for the corresponding descriptions
        """

        data = self.data

        # Filter rows with the specified lighting descriptions
        filtered_data = [
            row for row in data if row["lighting-description"] in [
                'Below average lighting efficiency',
                'Good lighting efficiency',
                'Excelent lighting efficiency'
            ]
        ]

        # Convert low-energy-lighting to float
        for row in filtered_data:
            row["low-energy-lighting"] = float(row["low-energy-lighting"])

        # Calculate averages
        sums = defaultdict(float)
        counts = defaultdict(int)

        for row in filtered_data:
            description = row["lighting-description"]
            sums[description] += row["low-energy-lighting"]
            counts[description] += 1

        averages = [{
            "lighting-description": correct_spelling(description.lower()),
            "low-energy-lighting": total / counts[description]
        } for description, total in sums.items()]

        return averages

    def clean(self) -> None:
        """
        Cleans the EPC data, mapping text fields to property epc_attributes.
        """
        self._init_empty_cleaned_obj()

        for field in self.CLEANING_FIELDS:
            self.unique_vals[field] = Counter([v[field] for v in self.data])

        self.clean_wrapper(field="floor-description", cleaning_cls=FloorAttributes)
        self.clean_wrapper(field="hotwater-description", cleaning_cls=HotWaterAttributes)
        self.clean_wrapper(field="main-fuel", cleaning_cls=MainFuelAttributes)
        self.clean_wrapper(field="mainheat-description", cleaning_cls=MainHeatAttributes)
        self.clean_wrapper(field="mainheatcont-description", cleaning_cls=MainheatControlAttributes)
        self.clean_wrapper(field="roof-description", cleaning_cls=RoofAttributes)
        self.clean_wrapper(field="walls-description", cleaning_cls=WallAttributes)
        self.clean_wrapper(field="windows-description", cleaning_cls=WindowAttributes)

        self.clean_wrapper(
            field="lighting-description", cleaning_cls=LightingAttributes, averages=self.lighting_averages
        )

    def _init_empty_cleaned_obj(self) -> None:
        """
        Initializes an empty object for cleaned data.
        """
        self.cleaned = {field: [] for field in self.CLEANING_FIELDS}

    def clean_wrapper(self, field, cleaning_cls, **kwargs):
        for description in self.unique_vals[field].keys():
            cln = cleaning_cls(description, **kwargs)

            self.cleaned[field].append(
                {
                    "original_description": description,
                    "clean_description": cln.description.capitalize(),
                    **cln.process()
                }
            )