rough implementation of lighting description cleaning

This commit is contained in:
Khalim Conn-Kowlessar 2023-07-03 15:07:36 +01:00
parent ba201c8b6a
commit d5e4baba05
3 changed files with 73 additions and 2 deletions

View file

@ -1,6 +1,8 @@
from typing import List, Dict, Any
from collections import Counter
import pandas as pd
from model_data.epc_attributes.FloorAttributes import FloorAttributes
from model_data.epc_attributes.HotWaterAttributes import HotWaterAttributes
from model_data.epc_attributes.MainFuelAttributes import MainFuelAttributes
@ -9,6 +11,7 @@ from model_data.epc_attributes.MainheatControlAttributes import MainheatControlA
from model_data.epc_attributes.RoofAttributes import RoofAttributes
from model_data.epc_attributes.WallAttributes import WallAttributes
from model_data.epc_attributes.WindowAttributes import WindowAttributes
from model_data.epc_attributes.LightingAttributes import LightingAttributes
class EpcClean:
@ -37,6 +40,33 @@ class EpcClean:
self.unique_vals: Dict[str, Any] = {}
self.cleaned: Dict[str, List[Any]] = {}
self.lighting_averages = self._calculate_lighting_averages()
def _calculate_lighting_averages(self):
"""
This is a simple utility function that for few textual lighting descritpions, will calculate the average
low energy lighting proportion. This is only valid for a very tiny number of cases and so a very simple
methodology is applied
:return: Dataframe of avergages for the corresponding descriptions
"""
df = pd.DataFrame(self.data)
aggs = df[
df["lighting-description"].isin(
[
'Below average lighting efficiency',
'Good lighting efficiency',
'Excelent lighting efficiency'
]
)
]
aggs["low-energy-lighting"] = aggs["low-energy-lighting"].astype(float)
averages = aggs.groupby("lighting-description")["low-energy-lighting"].mean().reset_index()
averages["lighting-description"] = averages["lighting-description"].str.lower()
return averages
def clean(self) -> None:
"""
Cleans the EPC data, mapping text fields to property epc_attributes.
@ -55,17 +85,21 @@ class EpcClean:
self.clean_wrapper(field="walls-description", cleaning_cls=WallAttributes)
self.clean_wrapper(field="windows-description", cleaning_cls=WindowAttributes)
self.clean_wrapper(
field="lighting-description", cleaning_cls=LightingAttributes, averages=self.lighting_averages
)
def _init_empty_cleaned_obj(self) -> None:
"""
Initializes an empty object for cleaned data.
"""
self.cleaned = {field: [] for field in self.CLEANING_FIELDS}
def clean_wrapper(self, field, cleaning_cls):
def clean_wrapper(self, field, cleaning_cls, **kwargs):
for description in self.unique_vals[field].keys():
self.cleaned[field].append(
{
"original_description": description,
**cleaning_cls(description).process()
**cleaning_cls(description, **kwargs).process()
}
)

View file

@ -264,3 +264,11 @@ self = SalModel(
data=all_data["data"],
cleaner=all_data["cleaner"]
)
# Scratch exploration: collect the distinct lighting descriptions present in the loaded records
# (order of the resulting list is not defined, as it comes from a set)
descs = list(set([record["lighting-description"] for record in all_data["data"]]))

# Tabular view of the same records
df = pd.DataFrame(all_data['data'])

View file

@ -0,0 +1,29 @@
import re
from model_data.epc_attributes.attribute_utils import clean_description
class LightingAttributes:
    """
    Derives the low energy lighting proportion from a textual lighting description.

    NOTE(review): EpcClean.clean_wrapper calls ``process()`` on the cleaning class it is given; this
    class does not define ``process()`` yet.
    """

    # Textual ratings that carry no explicit percentage and are resolved via the averages table.
    # Both spellings of "excellent" are listed because the averages helper in EpcClean filters on
    # 'Excelent' while the original check here used 'excellent'.
    # NOTE(review): confirm which spelling(s) actually occur in the raw data.
    _EFFICIENCY_PHRASES = (
        'good lighting efficiency',
        'excellent lighting efficiency',
        'excelent lighting efficiency',
        'below average lighting efficiency',
    )

    def __init__(self, description, averages):
        """
        :param description: Raw lighting description; it is lower-cased and passed through
            ``clean_description`` before being stored
        :param averages: DataFrame with columns "lighting-description" (lower-cased) and
            "low-energy-lighting", used for descriptions that contain no percentage
        """
        self.description: str = clean_description(description.lower())
        self.averages = averages

    def _lookup_average(self, phrase):
        """
        Returns the "low-energy-lighting" value of the first averages row whose "lighting-description"
        equals ``phrase``, or None when there is no averages table or no such row.
        """
        averages = self.averages
        if averages is None or len(averages) == 0:
            return None
        # Mask on the description column only. Comparing the whole frame to a string (as the
        # original did) blanks the numeric column, so the lookup never returned the right value.
        matches = averages.loc[averages["lighting-description"] == phrase, "low-energy-lighting"]
        if matches.empty:
            return None
        return matches.iloc[0]

    def low_energy_proportions(self):
        """
        Works out the low energy lighting proportion for ``self.description``.

        Checks are applied in order:
          1. "no low energy lighting" -> 0
          2. "all fixed outlets" -> 1
          3. a textual efficiency rating -> the matching value from ``self.averages``, if one exists
          4. the first run of digits in the description, divided by 100
          5. otherwise 0

        NOTE(review): the value from ``self.averages`` is returned as stored; confirm it is on the same
        0-1 scale as the other branches.

        :return: The proportion as described above
        """
        description = self.description

        if 'no low energy lighting' in description:
            return 0

        if "all fixed outlets" in description:
            return 1

        for phrase in self._EFFICIENCY_PHRASES:
            if phrase in description:
                average = self._lookup_average(phrase)
                if average is not None:
                    return average
                # No average available for this rating: fall through to the generic handling below
                break

        match = re.search(r'\d+', description)
        if match:
            return int(match.group()) / 100.0

        return 0