diff --git a/epc_data/app.py b/epc_data/app.py index cebd9ab8..f9d21beb 100644 --- a/epc_data/app.py +++ b/epc_data/app.py @@ -40,13 +40,13 @@ def handler(): cleaner.clean() # For testing: - from epc_data.attributes.MainheatControlAttributes import MainheatControlAttributes + from epc_data.attributes.MainFuelAttributes import MainFuelAttributes from collections import Counter count = Counter([x["main-fuel"] for x in data]) descriptions = {x["main-fuel"] for x in data} out = [] for description in descriptions: - res = MainheatControlAttributes(description).process() + res = MainFuelAttributes(description).process() out.append( { "original_description": description, diff --git a/epc_data/attributes/MainFuelAttributes.py b/epc_data/attributes/MainFuelAttributes.py index 1be67f23..78dd6ba6 100644 --- a/epc_data/attributes/MainFuelAttributes.py +++ b/epc_data/attributes/MainFuelAttributes.py @@ -1,37 +1,25 @@ from typing import Dict, Union -from epc_data.attributes.attribute_utils import clean_description +from epc_data.attributes.attribute_utils import clean_description, remove_punctuation class MainFuelAttributes: - COMMUNITY_FUEL_KEYWORDS = [ - 'from heat network data (community)', - 'mains gas (community)', - 'electricity (community)', - 'oil (community)', - 'biomass (community)', - 'heat from boilers using biodiesel from any biomass source (community)', - 'LPG (community)', - 'waste combustion (community)' - ] - - NOT_COMMUNITY_FUEL_KEYWORDS = [ - 'mains gas (not community)', - 'electricity (not community)', - 'oil (not community)', - 'house coal (not community)', - 'LPG (not community)', - 'biogas (not community)' - ] - - SPECIFIC_FUEL_KEYWORDS = [ - 'waste combustion - this is for backwards compatibility only and should not be used', - 'bottled LPG', + FUEL_KEYWORDS = [ + 'heat network', + 'mains gas', + 'electricity', + 'oil', + 'biomass', + 'biodiesel', + # Note: there is also a category called 'bottled LPG', but only 2/50k entries had this + 'LPG', + 'waste 
combustion', + 'biogas', 'wood logs', 'dual fuel - mineral + wood', - 'Gas: mains gas', - 'Electricity: electricity, unspecified tariff', + 'gas', 'anthracite', - 'smokeless coal' + 'smokeless coal', + 'house coal' ] NO_FUEL_KEYWORDS = [ @@ -39,16 +27,16 @@ class MainFuelAttributes: ] def __init__(self, description: str): - self.description: str = clean_description(description.lower()) + self.description: str = remove_punctuation(clean_description(description.lower())) self.nodata = not description or any(keyword in self.description for keyword in self.NO_FUEL_KEYWORDS) + self.is_community = False if 'not community' in self.description else 'community' in self.description + if not self.nodata and not any( self._keyword_in_description(keywords) for keywords in [ - self.COMMUNITY_FUEL_KEYWORDS, - self.NOT_COMMUNITY_FUEL_KEYWORDS, - self.SPECIFIC_FUEL_KEYWORDS + self.FUEL_KEYWORDS ] ): raise ValueError('Invalid description') @@ -58,10 +46,9 @@ class MainFuelAttributes: def process(self) -> Dict[str, Union[str, bool]]: result: Dict[str, Union[str, bool]] = { - "community_fuel": self._find_keyword(self.COMMUNITY_FUEL_KEYWORDS), - "not_community_fuel": self._find_keyword(self.NOT_COMMUNITY_FUEL_KEYWORDS), - "specific_fuel": self._find_keyword(self.SPECIFIC_FUEL_KEYWORDS), + "fuel_type": self._find_keyword(self.FUEL_KEYWORDS), "no_fuel": self._find_keyword(self.NO_FUEL_KEYWORDS), + "is_community": self.is_community } return result