from typing import List, Dict, Any
from collections import Counter, defaultdict

from model_data.utils import correct_spelling
from model_data.epc_attributes.FloorAttributes import FloorAttributes
from model_data.epc_attributes.HotWaterAttributes import HotWaterAttributes
from model_data.epc_attributes.MainFuelAttributes import MainFuelAttributes
from model_data.epc_attributes.MainheatAttributes import MainHeatAttributes
from model_data.epc_attributes.MainheatControlAttributes import MainheatControlAttributes
from model_data.epc_attributes.RoofAttributes import RoofAttributes
from model_data.epc_attributes.WallAttributes import WallAttributes
from model_data.epc_attributes.WindowAttributes import WindowAttributes
from model_data.epc_attributes.LightingAttributes import LightingAttributes


class EpcClean:
    """
    Container for the methods used to clean EPC data into epc_attributes records.
    """

    # Text fields processed by clean(), in processing order.
    CLEANING_FIELDS: List[str] = [
        "floor-description",
        "hotwater-description",
        "main-fuel",
        "mainheat-description",
        "mainheatcont-description",
        "roof-description",
        "walls-description",
        "windows-description",
        "lighting-description",
    ]

    # Lighting descriptions for which an average low-energy-lighting value is
    # computed. The spelling "Excelent" is reproduced verbatim; presumably it
    # matches the source data (verify before changing). It is normalised via
    # correct_spelling() in the output of _calculate_lighting_averages().
    _AVERAGED_LIGHTING_DESCRIPTIONS = (
        'Below average lighting efficiency',
        'Good lighting efficiency',
        'Excelent lighting efficiency',
    )

    def __init__(
        self,
        data: List[Dict[str, Any]],
        lighting_averages: List[Dict[str, str | float]] | None = None,
    ) -> None:
        """
        EpcClean constructor.

        :param data: List of dictionaries containing EPC data.
        :param lighting_averages: Optional pre-computed lighting averages, in the
            shape returned by _calculate_lighting_averages(). When omitted (or
            empty) the averages are computed from ``data``.
        """
        self.data: List[Dict[str, Any]] = data
        self.unique_vals: Dict[str, Any] = {}
        self.cleaned: Dict[str, List[Any]] = {}

        # Falsy check on purpose: None and an empty list both trigger recomputation.
        if not lighting_averages:
            self.lighting_averages = self._calculate_lighting_averages()
        else:
            self.lighting_averages = lighting_averages

    def _calculate_lighting_averages(self) -> List[Dict[str, Any]]:
        """
        Calculate the average low energy lighting proportion for a few textual
        lighting descriptions.

        This is only valid for a very tiny number of cases, so a very simple
        methodology is applied. It is done without pandas so it can be used
        inside of our lambdas.

        The rows in ``self.data`` are only read, never modified.

        :return: list of averages for the corresponding descriptions; each entry
            has a "lighting-description" key (lower-cased and passed through
            correct_spelling) and a "low-energy-lighting" key (mean value).
            Entries follow the order in which descriptions first appear in the data.
        :raises KeyError: if a row lacks "lighting-description", or a matching row
            lacks "low-energy-lighting".
        :raises ValueError: if a matching row's "low-energy-lighting" cannot be
            converted to float.
        """
        sums: Dict[str, float] = defaultdict(float)
        counts: Dict[str, int] = defaultdict(int)

        for row in self.data:
            description = row["lighting-description"]
            if description not in self._AVERAGED_LIGHTING_DESCRIPTIONS:
                continue
            # Convert into a local value so the caller's row is left untouched.
            sums[description] += float(row["low-energy-lighting"])
            counts[description] += 1

        return [
            {
                "lighting-description": correct_spelling(description.lower()),
                "low-energy-lighting": total / counts[description],
            }
            for description, total in sums.items()
        ]

    def clean(self) -> None:
        """
        Cleans the EPC data, mapping text fields to property epc_attributes.

        Resets ``self.cleaned``, counts the distinct values of every field in
        CLEANING_FIELDS into ``self.unique_vals``, then runs the matching
        attribute class over each distinct value.
        """
        self._init_empty_cleaned_obj()

        for field in self.CLEANING_FIELDS:
            self.unique_vals[field] = Counter(v[field] for v in self.data)

        # (field, attribute class, extra constructor kwargs). Built here rather
        # than at class level so the classes are looked up at call time.
        cleaning_plan = [
            ("floor-description", FloorAttributes, {}),
            ("hotwater-description", HotWaterAttributes, {}),
            ("main-fuel", MainFuelAttributes, {}),
            ("mainheat-description", MainHeatAttributes, {}),
            ("mainheatcont-description", MainheatControlAttributes, {}),
            ("roof-description", RoofAttributes, {}),
            ("walls-description", WallAttributes, {}),
            ("windows-description", WindowAttributes, {}),
            ("lighting-description", LightingAttributes, {"averages": self.lighting_averages}),
        ]
        for field, cleaning_cls, extra_kwargs in cleaning_plan:
            self.clean_wrapper(field=field, cleaning_cls=cleaning_cls, **extra_kwargs)

    def _init_empty_cleaned_obj(self) -> None:
        """
        Initializes an empty object for cleaned data.
        """
        self.cleaned = {field: [] for field in self.CLEANING_FIELDS}

    def clean_wrapper(self, field: str, cleaning_cls: Any, **kwargs: Any) -> None:
        """
        Run ``cleaning_cls`` over every distinct value of ``field`` and append
        the results to ``self.cleaned[field]``.

        ``self.unique_vals[field]`` and ``self.cleaned[field]`` must already
        exist; clean() sets both up before calling this method.

        :param field: Name of the EPC field being cleaned.
        :param cleaning_cls: Class instantiated as ``cleaning_cls(description, **kwargs)``;
            the instance must expose ``description`` and ``process()``, where
            ``process()`` returns a mapping merged into the output record.
        :param kwargs: Extra keyword arguments forwarded to ``cleaning_cls``.
        """
        for description in self.unique_vals[field]:
            cln = cleaning_cls(description, **kwargs)
            self.cleaned[field].append(
                {
                    "original_description": description,
                    "clean_description": cln.description.capitalize(),
                    **cln.process()
                }
            )