NOTE(review): line breaks and leading indentation below were reconstructed from a
whitespace-collapsed copy of this patch. Hunk line counts agree with the @@ headers,
but indentation and the exact position of blank context lines are a best guess;
confirm with `git apply --check` against the target tree before relying on it.

diff --git a/model_data/app.py b/model_data/app.py
index b49e3fd5..42ec1c9b 100644
--- a/model_data/app.py
+++ b/model_data/app.py
@@ -97,8 +97,7 @@ def handler():
         )
     )
 
-    test = [x for x in data if "Conservatory" in x["floor-description"]]
-    test = pd.DataFrame(test)
+    z = pd.DataFrame([x for x in data if x["hotwater-description"] == "From second main heating system"])
 
     # Incorporate input data into cleaning
     cleaner = EpcClean(data + [p.data for p in input_properties])
diff --git a/model_data/epc_attributes/HotWaterAttributes.py b/model_data/epc_attributes/HotWaterAttributes.py
index 760d12b1..3da3f9b0 100644
--- a/model_data/epc_attributes/HotWaterAttributes.py
+++ b/model_data/epc_attributes/HotWaterAttributes.py
@@ -22,6 +22,7 @@ class HotWaterAttributes(BaseUtility):
         'from main system', # The hot water is provided by the main heating system of the building
         'from secondary system',
         # The hot water is provided by a secondary (or supplementary) heating system in the building
+        'from second main heating system', # Same as 'from secondary system'
         'community scheme', # The hot water is provided by a community heating system
     ]
 
@@ -83,6 +84,11 @@ class HotWaterAttributes(BaseUtility):
         'gas range cooker', # A gas-powered range cooker
     ]
 
+    # Descriptions which represent the same thing
+    SYNONYMS = {
+        'from second main heating system': 'from secondary system',
+    }
+
     def __init__(self, description: str):
         self.description: str = clean_description(description.lower())
 
@@ -128,7 +134,7 @@ class HotWaterAttributes(BaseUtility):
 
         result: Dict[str, Union[str, bool]] = {
             "heater_type": find_keyword(self.description, self.HEATER_TYPES),
-            "system_type": find_keyword(self.description, self.SYSTEM_TYPES),
+            "system_type": find_keyword(self.description, self.SYSTEM_TYPES, self.SYNONYMS),
             "thermostat_characteristics": find_keyword(self.description, self.THERMOSTAT_CHARACTERISTICS),
             "heating_scope": find_keyword(self.description, self.HEATING_SCOPE),
             "energy_recovery": find_keyword(self.description, self.ENERGY_RECOVERY),
diff --git a/model_data/epc_attributes/attribute_utils.py b/model_data/epc_attributes/attribute_utils.py
index c0816943..b7140ab1 100644
--- a/model_data/epc_attributes/attribute_utils.py
+++ b/model_data/epc_attributes/attribute_utils.py
@@ -118,7 +118,10 @@ def remove_double_spaces(text):
     return cleaned_text
 
 
-def find_keyword(description, keywords):
+def find_keyword(description, keywords, synonyms=None):
+    if synonyms is None:
+        synonyms = {}
+
     # Sort keywords by length, longest first.
     # This ensures that 'time and temperature zone control'
     # will be checked before 'temperature zone control' if both are present in the keywords list
@@ -126,13 +129,13 @@ def find_keyword(description, keywords):
 
     for keyword in keywords:
         if keyword in description:
-            return keyword
+            return synonyms.get(keyword, keyword)
 
     # If no keyword is found, try again after removing punctuation
     description_without_punct = remove_punctuation(description)
     for keyword in keywords:
         if keyword in description_without_punct:
-            return keyword
+            return synonyms.get(keyword, keyword)
 
     return None
 
diff --git a/model_data/tests/test_data/test_hot_water_attributes_cases.py b/model_data/tests/test_data/test_hot_water_attributes_cases.py
index e85f566a..18d69733 100644
--- a/model_data/tests/test_data/test_hot_water_attributes_cases.py
+++ b/model_data/tests/test_data/test_hot_water_attributes_cases.py
@@ -120,5 +120,10 @@ hotwater_cases = [
     {'original_description': 'Gas range cooker', 'heater_type': None, 'system_type': None,
      'thermostat_characteristics': None, 'heating_scope': None, 'energy_recovery': None, 'tariff_type': None,
      'extra_features': None, 'chp_systems': None, 'distribution_system': None, 'no_system_present': None,
-     'assumed': False, "appliance": "gas range cooker"}
+     'assumed': False, "appliance": "gas range cooker"},
+    {'original_description': 'From second main heating system', 'heater_type': None,
+     'system_type': 'from secondary system',
+     'thermostat_characteristics': None, 'heating_scope': None, 'energy_recovery': None, 'tariff_type': None,
+     'extra_features': None, 'chp_systems': None, 'distribution_system': None, 'no_system_present': None,
+     'assumed': False, "appliance": None}
 ]