From d5e4baba054b4f6da748fa329b1b19faf00e0d83 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar
Date: Mon, 3 Jul 2023 15:07:36 +0100
Subject: [PATCH] rough implementation of lighing description cleaning

---
Review notes (text between "---" and the first "diff --git" is not applied):
- Changed "+" lines relative to the original commit: the 'Excelent' spelling
  in the isin() list, `.copy()` on the filtered frame, the averages lookup
  mask in LightingAttributes, and two docstring typos. Hunk line counts are
  unchanged; the `index` blob ids are therefore stale and informational only.
- Still open: LightingAttributes defines no process() method, yet
  EpcClean.clean_wrapper() calls cleaning_cls(...).process(), so that call
  fails with AttributeError until a process() method is added.

 model_data/EpcClean.py                        | 38 ++++++++++++++++++-
 model_data/analysis/SapModel.py               |  8 ++++
 .../epc_attributes/LightingAttributes.py      | 29 ++++++++++++++
 3 files changed, 73 insertions(+), 2 deletions(-)
 create mode 100644 model_data/epc_attributes/LightingAttributes.py

diff --git a/model_data/EpcClean.py b/model_data/EpcClean.py
index 03cc3a4e..167d1640 100644
--- a/model_data/EpcClean.py
+++ b/model_data/EpcClean.py
@@ -1,6 +1,8 @@
 from typing import List, Dict, Any
 from collections import Counter
 
+import pandas as pd
+
 from model_data.epc_attributes.FloorAttributes import FloorAttributes
 from model_data.epc_attributes.HotWaterAttributes import HotWaterAttributes
 from model_data.epc_attributes.MainFuelAttributes import MainFuelAttributes
@@ -9,6 +11,7 @@ from model_data.epc_attributes.MainheatControlAttributes import MainheatControlA
 from model_data.epc_attributes.RoofAttributes import RoofAttributes
 from model_data.epc_attributes.WallAttributes import WallAttributes
 from model_data.epc_attributes.WindowAttributes import WindowAttributes
+from model_data.epc_attributes.LightingAttributes import LightingAttributes
 
 
 class EpcClean:
@@ -37,6 +40,33 @@ class EpcClean:
         self.unique_vals: Dict[str, Any] = {}
         self.cleaned: Dict[str, List[Any]] = {}
 
+        self.lighting_averages = self._calculate_lighting_averages()
+
+    def _calculate_lighting_averages(self):
+
+        """
+        This is a simple utility function that, for a few textual lighting descriptions, will calculate the average
+        low energy lighting proportion. This is only valid for a very tiny number of cases and so a very simple
+        methodology is applied
+        :return: Dataframe of averages for the corresponding (lower-cased) descriptions
+        """
+
+        df = pd.DataFrame(self.data)
+        aggs = df[
+            df["lighting-description"].isin(
+                [
+                    'Below average lighting efficiency',
+                    'Good lighting efficiency',
+                    'Excellent lighting efficiency'
+                ]
+            )
+        ].copy()
+        aggs["low-energy-lighting"] = aggs["low-energy-lighting"].astype(float)
+
+        averages = aggs.groupby("lighting-description")["low-energy-lighting"].mean().reset_index()
+        averages["lighting-description"] = averages["lighting-description"].str.lower()
+        return averages
+
     def clean(self) -> None:
         """
         Cleans the EPC data, mapping text fields to property epc_attributes.
@@ -55,17 +85,21 @@ class EpcClean:
         self.clean_wrapper(field="walls-description", cleaning_cls=WallAttributes)
         self.clean_wrapper(field="windows-description", cleaning_cls=WindowAttributes)
 
+        self.clean_wrapper(
+            field="lighting-description", cleaning_cls=LightingAttributes, averages=self.lighting_averages
+        )
+
     def _init_empty_cleaned_obj(self) -> None:
         """
         Initializes an empty object for cleaned data.
         """
         self.cleaned = {field: [] for field in self.CLEANING_FIELDS}
 
-    def clean_wrapper(self, field, cleaning_cls):
+    def clean_wrapper(self, field, cleaning_cls, **kwargs):
         for description in self.unique_vals[field].keys():
             self.cleaned[field].append(
                 {
                     "original_description": description,
-                    **cleaning_cls(description).process()
+                    **cleaning_cls(description, **kwargs).process()
                 }
             )
diff --git a/model_data/analysis/SapModel.py b/model_data/analysis/SapModel.py
index 4832a2e8..f764bd00 100644
--- a/model_data/analysis/SapModel.py
+++ b/model_data/analysis/SapModel.py
@@ -264,3 +264,11 @@ self = SalModel(
     data=all_data["data"],
     cleaner=all_data["cleaner"]
 )
+
+descs = []
+for x in all_data["data"]:
+    descs.append(x["lighting-description"])
+
+descs = list(set(descs))
+
+df = pd.DataFrame(all_data['data'])
diff --git a/model_data/epc_attributes/LightingAttributes.py b/model_data/epc_attributes/LightingAttributes.py
new file mode 100644
index 00000000..49f39ef3
--- /dev/null
+++ b/model_data/epc_attributes/LightingAttributes.py
@@ -0,0 +1,29 @@
+import re
+from model_data.epc_attributes.attribute_utils import clean_description
+
+
+class LightingAttributes:
+    def __init__(self, description, averages):
+        self.description: str = clean_description(description.lower())
+        self.averages = averages
+
+    def low_energy_proportions(self):
+
+        description = self.description
+
+        if 'no low energy lighting' in description:
+            return 0
+
+        if "all fixed outlets" in description:
+            return 1
+
+        if ('good lighting efficiency' in description) or ('excellent lighting efficiency' in description) or \
+                ('below average lighting efficiency' in description):
+            return self.averages[self.averages["lighting-description"] == description]["low-energy-lighting"].values[0]
+
+        match = re.search(r'\d+', description)
+        if match:
+            proportion = int(match.group()) / 100.0
+            return proportion
+
+        return 0