diff --git a/epc_data/app.py b/epc_data/app.py index efa830ee..ea14882f 100644 --- a/epc_data/app.py +++ b/epc_data/app.py @@ -40,11 +40,11 @@ def handler(): cleaner.clean() # For testing: - from epc_data.attributes.FloorAttributes import FloorAttributes - descriptions = {x["floor-description"] for x in data} + from epc_data.attributes.MainheatAttributes import MainHeatAttributes + descriptions = {x["mainheat-description"] for x in data} out = [] for description in descriptions: - res = FloorAttributes(description).process() + res = MainHeatAttributes(description).process() out.append( { "original_description": description, @@ -53,5 +53,29 @@ def handler(): ) df = pd.DataFrame(out) df = df.sort_values("original_description") + df = df.reset_index(drop=True) + + z = df[df["original_description"] == 'Air source heat pump, radiators and underfloor, electric'] + + # Up to index: 14 + # Bugs: + # 1) + # Description: 'Air source heat pump fan coil units, electric' + # Issue: Because "oil" is a fuel type, "oil" is stripped out of the description and the description + # gets converted to "fan c units". It also marks this description as having oil, which it doesn't + # So this code probably won't detect any "fan coil units" + # 2) + # Description: 'Air source heat pump, Systems with radiators, electric' + # Issue: Check detection of Systems with radiators - it's only searching for "radiators" in DISTRIBUTION_SYSTEMS + # This may actually be fine as we have other descriptions such as + # 'Air source heat pump, Underfloor heating and radiators, pipes in insulated timber floor, electric' + # 3) + # Description: 'Air source heat pump, radiators and underfloor, electric' + # Issue: We don't have any logic which identifies this heating system as having underfloor heating. + # Currently, we look for "electric underfloor heating" and "underfloor heating" so we miss + # the underfloor characterisation. There are a few descriptions that just include "underfloor" + # e.g. 
from typing import Dict, List, Union


class MainHeatAttributes:
    """Turn a comma-separated main-heating description into boolean flags.

    The description is lower-cased and split on commas; each part is then
    matched word-by-word against the known-term lists below. Every match
    sets ``has_<term with spaces replaced by underscores>`` to ``True`` in
    the result. Matching is case-insensitive, but the flag name keeps the
    term's spelling exactly as declared here (e.g. ``has_LPG``).
    """

    HEAT_SYSTEMS = ["boiler", "air source heat pump", "room heaters", "electric storage heaters", "warm air",
                    "electric underfloor heating", "electric ceiling heating", "community scheme"]
    FUEL_TYPES = ["electric", "mains gas", "wood logs", "LPG", "coal", "oil", "wood pellets", "anthracite",
                  "dual fuel (mineral and wood)", "smokeless fuel"]
    DISTRIBUTION_SYSTEMS = ["underfloor heating", "radiators", "fan coil units", "pipes in screed above insulation",
                            "pipes in insulated timber floor", "pipes in concrete slab"]
    OTHERS = ["assumed", "Electricaire"]

    def __init__(self, description: str):
        """Validate and store the lower-cased description.

        Raises:
            ValueError: if ``description`` is empty/``None`` or contains none
                of the known terms (checked as case-insensitive substrings).
        """
        # Guard before calling .lower() so None/"" raise ValueError rather
        # than AttributeError.
        if not description:
            raise ValueError('Invalid description')

        self.description: str = description.lower()

        known_terms = self.HEAT_SYSTEMS + self.FUEL_TYPES + self.DISTRIBUTION_SYSTEMS + self.OTHERS
        # Terms such as "LPG" are declared in mixed case; lower them so they
        # can match the lower-cased description.
        if not any(term.lower() in self.description for term in known_terms):
            raise ValueError('Invalid description')

    def process(self) -> Dict[str, Union[str, bool]]:
        """Return the ``has_*`` flags detected in the description.

        Each comma-separated part is checked against the heat-system, fuel,
        distribution and "other" term lists, in that order. Only detected
        terms appear in the result; each detected flag is ``True``.
        """
        result: Dict[str, Union[str, bool]] = {}
        term_lists = (self.HEAT_SYSTEMS, self.FUEL_TYPES, self.DISTRIBUTION_SYSTEMS, self.OTHERS)

        for part in self.description.split(','):
            part = part.strip()  # remove leading/trailing white space
            for attr_list in term_lists:
                self._process_part(result, part, attr_list, 'has_')

        return result

    @staticmethod
    def _process_part(
        result: Dict[str, Union[str, bool]], part: str, attr_list: List[str], prefix: str
    ) -> Dict[str, Union[str, bool]]:
        """Flag every attribute in ``attr_list`` whose words all occur in ``part``.

        Matching is on whole whitespace-separated words and is
        case-insensitive, so "oil" does not match "coil". Word order and
        adjacency are not checked. Once an attribute matches, its words are
        consumed so that later attributes in the same list cannot reuse them.

        Args:
            result: dictionary updated in place with ``<prefix><attr>`` keys.
            part: one comma-separated fragment of the description.
            attr_list: candidate attribute phrases, tried in list order.
            prefix: string prepended to each generated key.

        Returns:
            The same ``result`` dictionary that was passed in.
        """
        remaining = part.lower().split()
        for attr in attr_list:
            attr_words = attr.lower().split()
            if not set(attr_words).issubset(remaining):
                continue
            # The key keeps the attribute's declared spelling.
            result[f'{prefix}{attr.replace(" ", "_")}'] = True
            for word in attr_words:
                # Guarded: an attribute that repeats a word must not raise.
                if word in remaining:
                    remaining.remove(word)

        # A bare "underfloor" left over (e.g. "radiators and underfloor")
        # still means underfloor heating.
        if "underfloor" in remaining:
            result[f'{prefix}underfloor_heating'] = True

        return result