Model/model_data/EpcClean.py
2023-07-03 15:18:44 +01:00

106 lines
4.2 KiB
Python

from typing import List, Dict, Any
from collections import Counter
import pandas as pd
from model_data.epc_attributes.FloorAttributes import FloorAttributes
from model_data.epc_attributes.HotWaterAttributes import HotWaterAttributes
from model_data.epc_attributes.MainFuelAttributes import MainFuelAttributes
from model_data.epc_attributes.MainheatAttributes import MainHeatAttributes
from model_data.epc_attributes.MainheatControlAttributes import MainheatControlAttributes
from model_data.epc_attributes.RoofAttributes import RoofAttributes
from model_data.epc_attributes.WallAttributes import WallAttributes
from model_data.epc_attributes.WindowAttributes import WindowAttributes
from model_data.epc_attributes.LightingAttributes import LightingAttributes
class EpcClean:
    """
    Container for the methods used to clean raw EPC data.

    For every free-text field listed in ``CLEANING_FIELDS`` the unique descriptions found in the data are
    collected, and each unique description is passed through the matching ``epc_attributes`` class.
    Results are stored on the instance:

    * ``unique_vals`` - per field, a ``Counter`` of the raw descriptions
    * ``cleaned`` - per field, a list with one dictionary per unique description
    * ``lighting_averages`` - dataframe of average low energy lighting proportions, built on construction
    """

    CLEANING_FIELDS: List[str] = [
        "floor-description",
        "hotwater-description",
        "main-fuel",
        "mainheat-description",
        "mainheatcont-description",
        "roof-description",
        "walls-description",
        "windows-description",
        "lighting-description",
    ]

    # Textual lighting descriptions for which an average low energy lighting proportion is calculated.
    # NOTE(review): "Excelent" is reproduced exactly as originally written; presumably it matches the
    # spelling used in the raw EPC data - confirm before "correcting" it.
    _LIGHTING_AVERAGE_DESCRIPTIONS: List[str] = [
        'Below average lighting efficiency',
        'Good lighting efficiency',
        'Excelent lighting efficiency',
    ]

    def __init__(self, data: List[Dict[str, Any]]) -> None:
        """
        EpcClean constructor.
        :param data: List of dictionaries containing EPC data.
        """
        self.data: List[Dict[str, Any]] = data
        self.unique_vals: Dict[str, Any] = {}
        self.cleaned: Dict[str, List[Any]] = {}
        self.lighting_averages: pd.DataFrame = self._calculate_lighting_averages()

    def _calculate_lighting_averages(self) -> pd.DataFrame:
        """
        Simple utility that, for a few textual lighting descriptions, calculates the average low energy
        lighting proportion. This is only valid for a very small number of cases and so a very simple
        methodology is applied.
        :return: Dataframe of averages with a lower-cased "lighting-description" column and the mean
            "low-energy-lighting" for that description. Empty (but with both columns) when there is no data.
        """
        df = pd.DataFrame(self.data)
        if df.empty:
            # With no records the dataframe has no columns at all, so the column lookups below would
            # raise a KeyError; return an empty result with the expected layout instead.
            return pd.DataFrame(
                {
                    "lighting-description": pd.Series(dtype=object),
                    "low-energy-lighting": pd.Series(dtype=float),
                }
            )

        is_textual = df["lighting-description"].isin(self._LIGHTING_AVERAGE_DESCRIPTIONS)
        # Copy so that the dtype conversion below does not write into a slice of ``df``
        aggs = df[is_textual].copy()
        aggs["low-energy-lighting"] = aggs["low-energy-lighting"].astype(float)

        averages = aggs.groupby("lighting-description")["low-energy-lighting"].mean().reset_index()
        averages["lighting-description"] = averages["lighting-description"].str.lower()
        return averages

    def clean(self) -> None:
        """
        Cleans the EPC data, mapping text fields to property epc_attributes.

        Resets ``cleaned``, counts the unique raw values of every field in ``CLEANING_FIELDS`` and then
        runs each field through its cleaning class.
        """
        self._init_empty_cleaned_obj()
        for field in self.CLEANING_FIELDS:
            self.unique_vals[field] = Counter(record[field] for record in self.data)

        # (field, cleaning class, extra keyword arguments for the cleaning class)
        cleaners = (
            ("floor-description", FloorAttributes, {}),
            ("hotwater-description", HotWaterAttributes, {}),
            ("main-fuel", MainFuelAttributes, {}),
            ("mainheat-description", MainHeatAttributes, {}),
            ("mainheatcont-description", MainheatControlAttributes, {}),
            ("roof-description", RoofAttributes, {}),
            ("walls-description", WallAttributes, {}),
            ("windows-description", WindowAttributes, {}),
            # Lighting is the only cleaner that is handed the pre-computed averages
            ("lighting-description", LightingAttributes, {"averages": self.lighting_averages}),
        )
        for field, cleaning_cls, extra in cleaners:
            self.clean_wrapper(field=field, cleaning_cls=cleaning_cls, **extra)

    def _init_empty_cleaned_obj(self) -> None:
        """
        Initializes an empty object for cleaned data.
        """
        self.cleaned = {field: [] for field in self.CLEANING_FIELDS}

    def clean_wrapper(self, field: str, cleaning_cls: type, **kwargs: Any) -> None:
        """
        Runs a cleaning class over every unique description of a field.

        For each key in ``self.unique_vals[field]`` this instantiates ``cleaning_cls(description, **kwargs)``,
        calls its ``process()`` method and appends the returned mapping, together with the untouched
        description under "original_description", to ``self.cleaned[field]``.
        :param field: Name of the EPC field; must already be present in ``unique_vals`` and ``cleaned``.
        :param cleaning_cls: Class taking the description (plus ``kwargs``) whose ``process()`` returns a mapping.
        :param kwargs: Extra keyword arguments forwarded to ``cleaning_cls``.
        """
        for description in self.unique_vals[field]:
            processed = cleaning_cls(description, **kwargs).process()
            self.cleaned[field].append({"original_description": description, **processed})